Distributed applications

Ulf Wiger etxuwig@REDACTED
Thu Mar 9 15:40:52 CET 2000


> From: Sean Hinde <Sean.Hinde@REDACTED>
> 
> > >And, more important, distributed applications don't work
> > >if you start the applications manually from the shell using
> > >application:start(App), you have to generate a boot script
> > >which starts the application automatically. (I'm investigating
> > >why this is the case, as part of the ongoing work we are doing
> > >on release handling).
> > 
> > Ahhh, this is what I am trying to do. I'll try it in my embedded test
> > environment.
> > 
> 
> Tried it and it works. I've managed to fool it though by pausing the master
> node using CTRL c for a minute or so. The node gets a big red cross in
> appmon and the app starts up elsewhere. If I re-introduce the first node by
> just selecting (c)ontinue from the CTRL c menu, and re-establish contact
> with net_adm:ping/1, the application continues to exist on both nodes. I
> guess this is testing partitioned network behaviour. Any suggestions of how
> to get around this?

What you've accomplished is, as you've noted, a partitioned network.
The distributed application controller (dist_ac) doesn't attempt
to resynchronize in this case (when both nodes think that the 
other node restarted.) What you can do, if you want to handle the
situation yourself, is to write your own program to detect that 
the network has been partitioned, and take appropriate action
(this involves selecting one node for termination, and making it
reboot.)

One technique that I've found useful is illustrated by the 
attached program:

- A simple gen_server using a hail protocol to find out which
  nodes are available
- Upon {nodeup, N}, send a cast to node N; if the cast reaches
  the corresponding monitor process on N, it's most likely a partitioned
  network (this assumes that it takes considerably longer for a node
  to start than it does for message passing between nodes.)
- For the program to be really safe, it should be started under a 
  supervisor with {maxR, 0}, so that the node restarts if the 
  monitor process crashes.

/Uffe

Ulf Wiger, Chief Designer AXD 301         <ulf.wiger@REDACTED>
Ericsson Telecom AB                          tfn: +46  8 719 81 95
Varuvägen 9, Älvsjö                          mob: +46 70 519 81 95
S-126 25 Stockholm, Sweden                   fax: +46  8 719 43 44
-------------- next part --------------
%%%----------------------------------------------------------------------
%%% File    : nodemon.erl
%%% Author  : Ulf Wiger <etxuwig@REDACTED>
%%% Purpose : Detect partitioned networks
%%% Created :  9 Mar 2000 by Ulf Wiger <etxuwig@REDACTED>
%%%----------------------------------------------------------------------

-module(nodemon).
-author('etxuwig@REDACTED').


%% External exports
-export([start_link/0,
	 get_nodes/0]).

%% gen_server callbacks
-export([init/1, 
	 handle_call/3, handle_cast/2, handle_info/2, 
	 terminate/2,
	 code_change/3]).

%% Server state.
%%   nodes - peer nodes recorded when a hail or hail_ack message arrives
%%           from them; an entry is removed again when a nodedown message
%%           is received for that node. Starts out empty.
-record(state, {nodes = []}).

%%%----------------------------------------------------------------------
%%% API
%%%----------------------------------------------------------------------
%% Start the node monitor and link it to the calling process.
%% The server is registered locally under the module name and is started
%% with the {debug,[trace]} option, so its events are traced.
%% Returns the result of gen_server:start_link/4.
start_link() ->
    ServerName = {local, ?MODULE},
    Options = [{debug, [trace]}],
    gen_server:start_link(ServerName, ?MODULE, [], Options).

%% Ask the server registered as ?MODULE on each node for the peer list it
%% currently holds. The request is issued with gen_server:multi_call/2, so
%% the result has the form {Replies, BadNodes}.
get_nodes() ->
    Request = get_nodes,
    gen_server:multi_call(?MODULE, Request).

%%%----------------------------------------------------------------------
%%% Callback functions from gen_server
%%%----------------------------------------------------------------------

%%----------------------------------------------------------------------
%% Func: init/1
%% Purpose: Subscribe to nodeup/nodedown messages and announce this node
%%          to the monitor process on every node currently in nodes().
%% Returns: {ok, State} with an empty peer list; peers are added later,
%%          as hail / hail_ack messages arrive.
%%----------------------------------------------------------------------
init([]) ->
    net_kernel:monitor_nodes(true),
    Hail = {?MODULE, hail, node()},
    %% The send is done purely for its side effect, hence foreach.
    lists:foreach(fun(Peer) -> {?MODULE, Peer} ! Hail end, nodes()),
    {ok, #state{}}.

%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Purpose: Serve synchronous requests. The only request handled is
%%          get_nodes, which replies with the current peer list and
%%          leaves the state untouched.
%% Returns: {reply, Reply, State}          |
%%          {reply, Reply, State, Timeout} |
%%          {noreply, State}               |
%%          {noreply, State, Timeout}      |
%%          {stop, Reason, Reply, State}   | (terminate/2 is called)
%%          {stop, Reason, State}            (terminate/2 is called)
%%----------------------------------------------------------------------
handle_call(get_nodes, _From, #state{nodes = Known} = State) ->
    {reply, Known, State}.

%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Purpose: Handle {maybe_partitioned_net, Node}, which a remote monitor
%%          sends when it sees this node come up. If Node is not in the
%%          local peer list, a partitioned-network banner is printed.
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%----------------------------------------------------------------------
handle_cast({maybe_partitioned_net, Peer}, #state{nodes = Known} = State) ->
    report_partition(lists:member(Peer, Known), Peer),
    %% The state is deliberately left as it is: what should happen after
    %% a partition has been detected is up to the application.
    {noreply, State}.

%% Print the partition banner unless the peer is already known.
report_partition(true, _Peer) ->
    ok;
report_partition(false, Peer) ->
    io:format("*****~n"
              "***** Partitioned network (~p,~p)~n"
              "*****~n", [node(), Peer]).

%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Purpose: Handle the hail protocol and node status messages:
%%   {?MODULE, hail, Node}     - answer with hail_ack and record Node
%%   {?MODULE, hail_ack, Node} - record Node
%%   {nodedown, Node}          - forget Node
%%   {nodeup, Node}            - cast maybe_partitioned_net to the monitor
%%                               on Node; the peer list is not changed
%%   anything else             - ignored
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%----------------------------------------------------------------------
handle_info({?MODULE, hail, Peer}, #state{} = State) ->
    {?MODULE, Peer} ! {?MODULE, hail_ack, node()},
    {noreply, remember_peer(Peer, State)};
handle_info({?MODULE, hail_ack, Peer}, #state{} = State) ->
    {noreply, remember_peer(Peer, State)};
handle_info({nodedown, Peer}, #state{nodes = Known} = State) ->
    {noreply, State#state{nodes = Known -- [Peer]}};
handle_info({nodeup, Peer}, #state{} = State) ->
    Notice = {maybe_partitioned_net, node()},
    gen_server:cast({?MODULE, Peer}, Notice),
    {noreply, State};
handle_info(_Other, State) ->
    {noreply, State}.

%% Put Peer at the head of the peer list, dropping one earlier occurrence
%% of it so that a repeated hail does not create a duplicate entry.
remember_peer(Peer, #state{nodes = Known} = State) ->
    State#state{nodes = [Peer | Known -- [Peer]]}.

%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Called when the server shuts down. There is nothing to clean
%%          up, so both arguments are ignored.
%% Returns: ok (the value is ignored by gen_server)
%%----------------------------------------------------------------------
terminate(_Reason, _State) ->
    ok.

%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Code upgrade hook. The state is passed through unchanged;
%%          the version and extra arguments are ignored.
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------


More information about the erlang-questions mailing list