Distributed applications
Ulf Wiger
etxuwig@REDACTED
Thu Mar 9 15:40:52 CET 2000
> From: Sean Hinde <Sean.Hinde@REDACTED>
>
> > >And, more important, distributed applications doesn't work
> > >if you start the applications manually from the shell using
> > >application:start(App), you have to generate a boot script
> > >which starts the application automatically. (I'm investigating
> > >why this is the case, as part of the ongoing work we are doing
> > >on release handling).
> >
> > Ahhh, this is what I am trying to do. I'll try it in my embedded test
> > environment.
> >
>
> Tried it and it works. I've managed to fool it though by pausing the master
> node using CTRL c for a minute or so. The node gets a big red cross in
> appmon and the app starts up elsewhere. If I re-introduce the first node by
> just selecting (c)ontinue from the CTRL c menu, and re-establish contact
> with net_adm:ping/1, the application continues to exist on both nodes. I
> guess this is testing partitioned network behaviour. Any suggestions of how
> to get around this?
What you've accomplished is, as you've noted, a partitioned network.
The distributed application controller (dist_ac) doesn't attempt
to resynchronize in this case (when both nodes think that the
other node restarted.) What you can do, if you want to handle the
situation yourself, is to write your own program to detect that
the network has been partitioned, and take appropriate action
(this involves selecting one node for termination, and making it
reboot.)
One technique that I've found useful is illustrated by the
attached program:
- A simple gen_server using a hail protocol to find out which
nodes are available
- Upon {nodeup, N}, send a cast to node N; if the cast reaches
the corresponding monitor process on N, it's most likely a partitioned
network (this assumes that it takes considerably longer for a node
to start, than it does for message passing between nodes.)
- For the program to be really safe, it should be started under a
supervisor with {maxR, 0}, so that the node restarts if the
monitor process crashes.
/Uffe
Ulf Wiger, Chief Designer AXD 301 <ulf.wiger@REDACTED>
Ericsson Telecom AB tfn: +46 8 719 81 95
Varuvägen 9, Älvsjö mob: +46 70 519 81 95
S-126 25 Stockholm, Sweden fax: +46 8 719 43 44
-------------- next part --------------
%%%----------------------------------------------------------------------
%%% File : nodemon.erl
%%% Author : Ulf Wiger <etxuwig@REDACTED>
%%% Purpose : Detect partitioned networks
%%% Created : 9 Mar 2000 by Ulf Wiger <etxuwig@REDACTED>
%%%----------------------------------------------------------------------
-module(nodemon).
-author('etxuwig@REDACTED').
%% Declaring the behaviour lets the compiler check the callback exports
%% below against what gen_server requires.
-behaviour(gen_server).

%% External exports
-export([start_link/0,
         get_nodes/0]).

%% gen_server callbacks
-export([init/1,
         handle_call/3, handle_cast/2, handle_info/2,
         terminate/2,
         code_change/3]).

%% Server state. `nodes' holds the remote nodes whose nodemon process has
%% been heard from through the hail / hail_ack exchange; a node is removed
%% again when a nodedown message for it arrives.
-record(state, {nodes = []}).
%%%----------------------------------------------------------------------
%%% API
%%%----------------------------------------------------------------------
%% Starts the monitor as a gen_server linked to the caller and registered
%% locally under the module name. The sys `trace' debug option is enabled
%% from the start.
start_link() ->
    ServerName = {local, ?MODULE},
    Options = [{debug, [trace]}],
    gen_server:start_link(ServerName, ?MODULE, [], Options).
%% Sends a get_nodes call to the locally registered nodemon server on this
%% node and on every currently connected node.
%% Returns whatever gen_server:multi_call/3 returns, i.e.
%% {Replies, BadNodes}.
get_nodes() ->
    Targets = [node() | nodes()],
    gen_server:multi_call(Targets, ?MODULE, get_nodes).
%%%----------------------------------------------------------------------
%%% Callback functions from gen_server
%%%----------------------------------------------------------------------
%%----------------------------------------------------------------------
%% Func: init/1
%% Returns: {ok, State} |
%% {ok, State, Timeout} |
%% {stop, Reason}
%%----------------------------------------------------------------------
init([]) ->
    %% Subscribe this process to {nodeup, N} / {nodedown, N} messages.
    net_kernel:monitor_nodes(true),
    %% Hail the nodemon process on every node we are already connected to;
    %% each one that is running answers with a hail_ack (see handle_info/2).
    Hail = {?MODULE, hail, node()},
    lists:foreach(fun(Peer) -> {?MODULE, Peer} ! Hail end, nodes()),
    %% Start with an empty list of known nodes.
    {ok, #state{}}.
%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Returns: {reply, Reply, State} |
%% {reply, Reply, State, Timeout} |
%% {noreply, State} |
%% {noreply, State, Timeout} |
%% {stop, Reason, Reply, State} | (terminate/2 is called)
%% {stop, Reason, State} (terminate/2 is called)
%%----------------------------------------------------------------------
%% get_nodes: reply with the current list of known nodes and leave the
%% state unchanged. The caller reference is not needed because the reply
%% is returned directly, hence the underscore-prefixed _From.
handle_call(get_nodes, _From, S = #state{nodes = Ns}) ->
    {reply, Ns, S}.
%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Returns: {noreply, State} |
%% {noreply, State, Timeout} |
%% {stop, Reason, State} (terminate/2 is called)
%%----------------------------------------------------------------------
%% {maybe_partitioned_net, Peer}: cast by the nodemon on Peer when it
%% received a nodeup for this node. If Peer is not in our list of known
%% nodes, a partitioned-network notice is printed.
handle_cast({maybe_partitioned_net, Peer}, #state{nodes = Known} = State) ->
    case lists:member(Peer, Known) of
        true ->
            ok;
        false ->
            io:format("*****~n"
                      "***** Partitioned network (~p,~p)~n"
                      "*****~n", [node(), Peer])
    end,
    %% The state is deliberately left untouched: what should happen after
    %% a partition has been detected is up to the application.
    {noreply, State}.
%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Returns: {noreply, State} |
%% {noreply, State, Timeout} |
%% {stop, Reason, State} (terminate/2 is called)
%%----------------------------------------------------------------------
%% hail: a nodemon on FromNode announced itself. Acknowledge it and record
%% FromNode at the head of the known-nodes list (one existing entry for it
%% is dropped first so it is not duplicated).
handle_info({?MODULE, hail, FromNode}, S = #state{nodes = Ns}) ->
    {?MODULE, FromNode} ! {?MODULE, hail_ack, node()},
    {noreply, S#state{nodes = [FromNode | Ns -- [FromNode]]}};
%% hail_ack: FromNode answered a hail we sent; record it the same way.
handle_info({?MODULE, hail_ack, FromNode}, S = #state{nodes = Ns}) ->
    {noreply, S#state{nodes = [FromNode | Ns -- [FromNode]]}};
%% nodedown: forget the node.
handle_info({nodedown, N}, S = #state{nodes = Ns}) ->
    {noreply, S#state{nodes = Ns -- [N]}};
%% nodeup: cast to the nodemon on N so that it can check whether it knows
%% us (see handle_cast/2). The node list is not read or changed here, so
%% only the record shape is matched.
handle_info({nodeup, N}, S = #state{}) ->
    gen_server:cast({?MODULE, N}, {maybe_partitioned_net, node()}),
    {noreply, S};
%% Any other message is ignored.
handle_info(_Msg, State) ->
    {noreply, State}.
%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Shutdown the server
%% Returns: any (ignored by gen_server)
%%----------------------------------------------------------------------
%% Nothing to clean up; both arguments are ignored, hence the
%% underscore-prefixed names.
terminate(_Reason, _State) ->
    ok.
%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Upgrade internal state
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------
%% The state is returned unchanged; the old version and the extra
%% argument are ignored.
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.
%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------
More information about the erlang-questions
mailing list