[erlang-bugs] Problem with tw timer support in diameter app (otp_R16B)

Aleksander Nycz Aleksander.Nycz@REDACTED
Mon May 20 10:41:46 CEST 2013


Hello,

I change default value for param *restrict_connections *from 'nodes' to 
'false'.
After that I run very simple test using seagull symulator. Test scenario 
was following:

1. seagull: send CER
2. seagull: recv CEA
3. seagull: send CCR (init)
4. seagull: recv CCA (init)
5. seagull: send CCR (update)
6. seagull: recv CCR (update)
7. seagull: send CCR (terminate)
8. seagull: recv CCA (terminate)

After step 8. seagull does't send DPR, but just closes transport 
connection (TCP)

On server side every think looks good, but 30 sec. after CCR (terminate) 
when tw elapsed, following error message appears in log:


13:40:58.187129: <0.5046.0>: error: error_logger: --:--/--: ** Generic 
server <0.5046.0> terminating
** Last message in was {timeout,#Ref<0.0.0.14845>,tw}
** When Server state == {watchdog,down,false,30000,0,<0.1009.0>,undefined,
#Ref<0.0.0.14845>,diameter_gen_base_rfc3588,
                             {recvdata,4259932,diameterNode,
[{diameter_app,diameterNode,dictionaryDCCA,
                                      [dccaCallback],
                                      diameterNode,4,false,
                                      [{answer_errors,report},
{request_errors,answer_3xxx}]}],
                                 {0,32}},
                             {0,32},
                             {false,false},
                             false}
** Reason for termination ==
** {function_clause,
        [{diameter_watchdog,set_watchdog,
             [stop],
             [{file,"base/diameter_watchdog.erl"},{line,451}]},
         {diameter_watchdog,handle_info,2,
             [{file,"base/diameter_watchdog.erl"},{line,211}]},
{gen_server,handle_msg,5,[{file,"gen_server.erl"},{line,597}]},
{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}

13:40:58.187500: <0.5046.0>: error: error_logger: --:--/--: 
[crash_report][[[{initial_call,{diameter_watchdog,init,['Argument__1']}},
                  {pid,<0.5046.0>},
                  {registered_name,[]},
{error_info,{exit,{function_clause,[{diameter_watchdog,set_watchdog,[stop],[{file,"base/diameter_watchdog.erl"},{line,451}]},
{diameter_watchdog,handle_info,2,[{file,"base/diameter_watchdog.erl"},{line,211}]},
{gen_server,handle_msg,5,[{file,"gen_server.erl"},{line,597}]},
{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]},
[{gen_server,terminate,6,[{file,"gen_server.erl"},{line,737}]},
{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}},
{ancestors,[diameter_watchdog_sup,diameter_sup,<0.946.0>]},
                  {messages,[]},
                  {links,[<0.954.0>]},
                  {dictionary,[{random_seed,{15047,18051,14647}},
                               {{diameter_watchdog,restart},
                                {{accept,#Ref<0.0.0.1696>},
                                 [{transport_module,diameter_tcp},
{transport_config,[{reuseaddr,true},{ip,{0,0,0,0}},{port,4068}]},
{capabilities_cb,[#Fun<diameterNode.acceptCER.2>]},
                                  {watchdog_timer,30000},
                                  {reconnect_timer,60000}],
                                 {diameter_service,<0.1009.0>,
{diameter_caps,"zyndram.krakow.comarch","krakow.comarch",[],25429,"Comarch 
DIAMETER Server",[],
[12645,10415,8164],
[4],
[],[],[],[],[]},
[{diameter_app,diameterNode,dictionaryDCCA,
[dccaCallback],
diameterNode,4,false,
[{answer_errors,report},{request_errors,answer_3xxx}]}]}}},
                               {{diameter_watchdog,dwr},
['DWR',{'Origin-Host',"zyndram.krakow.comarch"},{'Origin-Realm',"krakow.comarch"},{'Origin-State-Id',[]}]}]},
                  {trap_exit,false},
                  {status,running},
                  {heap_size,75025},
                  {stack_size,24},
                  {reductions,294}],
                 []]]
13:40:58.189060: <0.954.0>: error: error_logger: --:--/--: 
[supervisor_report][[{supervisor,{local,diameter_watchdog_sup}},
                      {errorContext,child_terminated},
{reason,{function_clause,[{diameter_watchdog,set_watchdog,[stop],[{file,"base/diameter_watchdog.erl"},{line,451}]},
{diameter_watchdog,handle_info,2,[{file,"base/diameter_watchdog.erl"},{line,211}]},
{gen_server,handle_msg,5,[{file,"gen_server.erl"},{line,597}]},
{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}},
                      {offender,[{pid,<0.5046.0>},
                                 {name,diameter_watchdog},
{mfargs,{diameter_watchdog,start_link,undefined}},
                                 {restart_type,temporary},
                                 {shutdown,1000},
                                 {child_type,worker}]}]]

You can check, that function set_watchdog should be called with param 
#watchdog{}, but 'stop' param is used instead.
As a result function_clause exception is thrown.

I suggest following change in code to correct this problem (file 
diameter_watchdog.erl):

$ diff diameter_watchdog.erl_org diameter_watchdog.erl
385a386,393
 > transition({timeout, TRef, tw}, #watchdog{tref = TRef, status = T} = S)
 >   when T == initial;
 >        T == down ->
 >     case restart(S) of
 >         stop -> stop;
 >         #watchdog{} = NewS -> set_watchdog(NewS)
 >     end;
 >

You can find this solution in attachement.

Best regards
Aleksander Nycz

-- 
Aleksander Nycz
Senior Software Engineer
Telco_021 BSS R&D
Comarch SA
Phone:  +48 12 646 1216
Mobile: +48 691 464 275
website: www.comarch.pl

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://erlang.org/pipermail/erlang-bugs/attachments/20130520/44c7c439/attachment.htm>
-------------- next part --------------
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2010-2013. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% This module implements (as a process) the state machine documented
%% in Appendix A of RFC 3539.
%%

-module(diameter_watchdog).
-behaviour(gen_server).

%% towards diameter_service
-export([start/2]).

%% gen_server callbacks
-export([init/1,
         handle_call/3,
         handle_cast/2,
         handle_info/2,
         terminate/2,
         code_change/3]).

%% diameter_watchdog_sup callback
-export([start_link/1]).

-include_lib("diameter/include/diameter.hrl").
-include("diameter_internal.hrl").

-define(DEFAULT_TW_INIT, 30000). %% RFC 3539 ch 3.4.1
-define(NOMASK, {0,32}).  %% default sequence mask

-define(BASE, ?DIAMETER_DICT_COMMON).

-record(watchdog,
        {%% PCB - Peer Control Block; see RFC 3539, Appendix A
         status = initial :: initial | okay | suspect | down | reopen,
         pending = false  :: boolean(),  %% DWA
         tw :: 6000..16#FFFFFFFF | {module(), atom(), list()},
                                %% {M,F,A} -> integer() >= 0
         num_dwa = 0 :: -1 | non_neg_integer(),
                     %% number of DWAs received during reopen
         %% end PCB
         parent = self() :: pid(),              %% service process
         transport       :: pid() | undefined,  %% peer_fsm process
         tref :: reference(),     %% reference for current watchdog timer
         dictionary :: module(),  %% common dictionary
         receive_data :: term(),
                 %% term passed into diameter_service with incoming message
         sequence :: diameter:sequence(),     %% mask
         restrict :: {diameter:restriction(), boolean()},
         shutdown = false :: boolean()}).

%% ---------------------------------------------------------------------------
%% start/2
%%
%% Start a monitor before the watchdog is allowed to proceed to ensure
%% that a failed capabilities exchange produces the desired exit
%% reason.
%% ---------------------------------------------------------------------------

-spec start(Type, {RecvData, [Opt], SvcOpts, #diameter_service{}})
   -> {reference(), pid()}
 when Type :: {connect|accept, diameter:transport_ref()},
      RecvData :: term(),
      Opt :: diameter:transport_opt(),
      SvcOpts :: [diameter:service_opt()].

start({_,_} = Type, T) ->
    Ack = make_ref(),
    {ok, Pid} = diameter_watchdog_sup:start_child({Ack, Type, self(), T}),
    try
        {erlang:monitor(process, Pid), Pid}
    after
        send(Pid, Ack)
    end.

start_link(T) ->
    {ok, _} = proc_lib:start_link(?MODULE,
                                  init,
                                  [T],
                                  infinity,
                                  diameter_lib:spawn_opts(server, [])).

%% ===========================================================================
%% ===========================================================================

%% init/1

init(T) ->
    proc_lib:init_ack({ok, self()}),
    gen_server:enter_loop(?MODULE, [], i(T)).

i({Ack, T, Pid, {RecvData,
                 Opts,
                 SvcOpts,
                 #diameter_service{applications = Apps,
                                   capabilities = Caps}
                 = Svc}}) ->
    erlang:monitor(process, Pid),
    wait(Ack, Pid),
    random:seed(now()),
    putr(restart, {T, Opts, Svc}),  %% save seeing it in trace
    putr(dwr, dwr(Caps)),           %%
    {_,_} = Mask = proplists:get_value(sequence, SvcOpts),
    Restrict = proplists:get_value(restrict_connections, SvcOpts),
    Nodes = restrict_nodes(Restrict),
    Dict0 = common_dictionary(Apps),
    #watchdog{parent = Pid,
              transport = start(T, Opts, Mask, Nodes, Dict0, Svc),
              tw = proplists:get_value(watchdog_timer,
                                       Opts,
                                       ?DEFAULT_TW_INIT),
              receive_data = RecvData,
              dictionary = Dict0,
              sequence = Mask,
              restrict = {Restrict, lists:member(node(), Nodes)}}.

wait(Ref, Pid) ->
    receive
        Ref ->
            ok;
        {'DOWN', _, process, Pid, _} = D ->
            exit({shutdown, D})
    end.

%% start/5

start(T, Opts, Mask, Nodes, Dict0, Svc) ->
    {_MRef, Pid}
        = diameter_peer_fsm:start(T, Opts, {Mask, Nodes, Dict0, Svc}),
    Pid.

%% common_dictionary/1
%%
%% Determine the dictionary of the Diameter common application with
%% Application Id 0. Fail on config errors.

common_dictionary(Apps) ->
    case
        orddict:fold(fun dict0/3,
                     false,
                     lists:foldl(fun(#diameter_app{dictionary = M}, D) ->
                                         orddict:append(M:id(), M, D)
                                 end,
                                 orddict:new(),
                                 Apps))
    of
        {value, Mod} ->
            Mod;
        false ->
            %% A transport should configure a common dictionary but
            %% don't require it. Not configuring a common dictionary
            %% means a user won't be able either send of receive
            %% messages in the common dictionary: incoming request
            %% will be answered with 3007 and outgoing requests cannot
            %% be sent. The dictionary returned here is oly used for
            %% messages diameter sends and receives: CER/CEA, DPR/DPA
            %% and DWR/DWA.
            ?BASE
    end.

%% Each application should be represented by a single dictionary.
dict0(Id, [_,_|_] = Ms, _) ->
    config_error({multiple_dictionaries, Ms, {application_id, Id}});

%% An explicit common dictionary.
dict0(?APP_ID_COMMON, [Mod], _) ->
    {value, Mod};

%% A pure relay, in which case the common application is implicit.
%% This uses the fact that the common application will already have
%% been folded.
dict0(?APP_ID_RELAY, _, false) ->
    {value, ?BASE};

dict0(_, _, Acc) ->
    Acc.

config_error(T) ->
    ?ERROR({configuration_error, T}).

%% handle_call/3

handle_call(_, _, State) ->
    {reply, nok, State}.

%% handle_cast/2

handle_cast(_, State) ->
    {noreply, State}.

%% handle_info/2

handle_info(T, #watchdog{} = State) ->
    case transition(T, State) of
        ok ->
            {noreply, State};
        #watchdog{} = S ->
            close(T, State),     %% service expects 'close' message
            event(T, State, S),  %%   before 'watchdog'
            {noreply, S};
        stop ->
            ?LOG(stop, T),
            event(T, State, State#watchdog{status = down}),
            {stop, {shutdown, T}, State}
    end.

close({'DOWN', _, process, TPid, {shutdown, Reason}},
      #watchdog{transport = TPid,
                parent = Pid}) ->
    send(Pid, {close, self(), Reason});

close(_, _) ->
    ok.

event(_, #watchdog{status = T}, #watchdog{status = T}) ->
    ok;

event(_, #watchdog{transport = undefined}, #watchdog{transport = undefined}) ->
    ok;

event(Msg,
      #watchdog{status = From, transport = F, parent = Pid},
      #watchdog{status = To, transport = T}) ->
    TPid = tpid(F,T),
    E = {[TPid | data(Msg, TPid, From, To)], From, To},
    send(Pid, {watchdog, self(), E}),
    ?LOG(transition, {self(), E}).

data(Msg, TPid, reopen, okay) ->
    {recv, TPid, 'DWA', _Pkt} = Msg,  %% assert
    {TPid, T} = eraser(open),
    [T];

data({open, TPid, _Hosts, T}, TPid, _From, To)
  when To == okay;
       To == reopen ->
    [T];

data(_, _, _, _) ->
    [].

tpid(_, Pid)
  when is_pid(Pid) ->
    Pid;

tpid(Pid, _) ->
    Pid.

send(Pid, T) ->
    Pid ! T.

%% terminate/2

terminate(_, _) ->
    ok.

%% code_change/3

code_change(_, State, _) ->
    {ok, State}.

%% ===========================================================================
%% ===========================================================================

%% transition/2
%%
%% The state transitions documented here are extracted from RFC 3539,
%% the commentary is ours.

%% Service or watchdog is telling the watchdog of an accepting
%% transport to die after reconnect_timer expiry or reestablished
%% connection (in another transport process) respectively.
transition(close, #watchdog{status = down}) ->
    {{accept, _}, _, _} = getr(restart), %% assert
    stop;
transition(close, #watchdog{}) ->
    ok;

%% Service is asking for the peer to be taken down gracefully.
transition({shutdown, Pid, _}, #watchdog{parent = Pid,
                                         transport = undefined}) ->
    stop;
transition({shutdown = T, Pid, Reason}, #watchdog{parent = Pid,
                                                  transport = TPid}
                                        = S) ->
    send(TPid, {T, self(), Reason}),
    S#watchdog{shutdown = true};

%% Parent process has died,
transition({'DOWN', _, process, Pid, _Reason},
           #watchdog{parent = Pid}) ->
    stop;

%% Transport has accepted a connection.
transition({accepted = T, TPid}, #watchdog{transport = TPid,
                                           parent = Pid}) ->
    send(Pid, {T, self(), TPid}),
    ok;

%%   STATE         Event                Actions              New State
%%   =====         ------               -------              ----------
%%   INITIAL       Connection up        SetWatchdog()        OKAY

%% By construction, the watchdog timer isn't set until we move into
%% state okay as the result of the Peer State Machine reaching the
%% Open state.
%%
%% If we're accepting then we may be resuming a connection that went
%% down in another watchdog process, in which case this is the
%% transition below, from down to reopen. That is, it's not until we
%% know the identity of the peer (ie. now) that we know that we're in
%% state down rather than initial.

transition({open, TPid, Hosts, _} = Open,
           #watchdog{transport = TPid,
                     status = initial,
                     restrict = {_, R}}
           = S) ->
    case okay(getr(restart), Hosts, R) of
        okay ->
            set_watchdog(S#watchdog{status = okay});
        reopen ->
            transition(Open, S#watchdog{status = down})
    end;

%%   DOWN          Connection up        NumDWA = 0
%%                                      SendWatchdog()
%%                                      SetWatchdog()
%%                                      Pending = TRUE       REOPEN

transition({open = Key, TPid, _Hosts, T},
           #watchdog{transport = TPid,
                     status = down}
           = S) ->
    %% Store the info we need to notify the parent to reopen the
    %% connection after the requisite DWA's are received, at which
    %% time we eraser(open). The reopen message is a later addition,
    %% to communicate the new capabilities as soon as they're known.
    putr(Key, {TPid, T}),
    set_watchdog(send_watchdog(S#watchdog{status = reopen,
                                          num_dwa = 0}));

%%   OKAY          Connection down      CloseConnection()
%%                                      Failover()
%%                                      SetWatchdog()        DOWN
%%   SUSPECT       Connection down      CloseConnection()
%%                                      SetWatchdog()        DOWN
%%   REOPEN        Connection down      CloseConnection()
%%                                      SetWatchdog()        DOWN

transition({'DOWN', _, process, TPid, _Reason},
           #watchdog{transport = TPid,
                     shutdown = true}) ->
    stop;

transition({'DOWN', _, process, TPid, _Reason},
           #watchdog{transport = TPid,
                     status = T}
           = S) ->
    set_watchdog(S#watchdog{status = case T of initial -> T; _ -> down end,
                            pending = false,
                            transport = undefined});

%% Incoming message.
transition({recv, TPid, Name, Pkt}, #watchdog{transport = TPid} = S) ->
    recv(Name, Pkt, S);

%% Current watchdog has timed out.
transition({timeout, TRef, tw}, #watchdog{tref = TRef, status = T} = S)
  when T == initial;
       T == down ->
    case restart(S) of 
        stop -> stop;
        #watchdog{} = NewS -> set_watchdog(NewS)
    end;

transition({timeout, TRef, tw}, #watchdog{tref = TRef} = S) ->
    set_watchdog(timeout(S));

%% Timer was canceled after message was already sent.
transition({timeout, _, tw}, #watchdog{}) ->
    ok;

%% State query.
transition({state, Pid}, #watchdog{status = S}) ->
    send(Pid, {self(), S}),
    ok.

%% ===========================================================================

putr(Key, Val) ->
    put({?MODULE, Key}, Val).

getr(Key) ->
    get({?MODULE, Key}).

eraser(Key) ->
    erase({?MODULE, Key}).

%% encode/3

encode(Msg, Mask, Dict) ->
    Seq = diameter_session:sequence(Mask),
    Hdr = #diameter_header{version = ?DIAMETER_VERSION,
                           end_to_end_id = Seq,
                           hop_by_hop_id = Seq},
    Pkt = #diameter_packet{header = Hdr,
                           msg = Msg},
    #diameter_packet{bin = Bin} = diameter_codec:encode(Dict, Pkt),
    Bin.

%% okay/3

okay({{accept, Ref}, _, _}, Hosts, Restrict) ->
    T = {?MODULE, connection, Ref, Hosts},
    diameter_reg:add(T),
    if Restrict ->
            okay(diameter_reg:match(T));
       true ->
            okay
    end;
%% Register before matching so that at least one of two registering
%% processes will match the other.

okay({{connect, _}, _, _}, _, _) ->
    okay.

%% okay/2

%% The peer hasn't been connected recently ...
okay([{_,P}]) ->
    P = self(),  %% assert
    okay;

%% ... or it has.
okay(C) ->
    [_|_] = [send(P, close) || {_,P} <- C, self() /= P],
    reopen.

%% set_watchdog/1

set_watchdog(#watchdog{tw = TwInit,
                       tref = TRef}
             = S) ->
    cancel(TRef),
    S#watchdog{tref = erlang:start_timer(tw(TwInit), self(), tw)}.

cancel(undefined) ->
    ok;
cancel(TRef) ->
    erlang:cancel_timer(TRef).

tw(T)
  when is_integer(T), T >= 6000 ->
    T - 2000 + (random:uniform(4001) - 1); %% RFC3539 jitter of +/- 2 sec.
tw({M,F,A}) ->
    apply(M,F,A).

%% send_watchdog/1

send_watchdog(#watchdog{pending = false,
                        transport = TPid,
                        dictionary = Dict0,
                        sequence = Mask}
              = S) ->
    send(TPid, {send, encode(getr(dwr), Mask, Dict0)}),
    ?LOG(send, 'DWR'),
    S#watchdog{pending = true}.

%% recv/3

recv(Name, Pkt, S) ->
    try rcv(Name, S) of
        #watchdog{} = NS ->
            rcv(Name, Pkt, S),
            NS
    catch
        {?MODULE, throwaway, #watchdog{} = NS} ->
            NS
    end.

%% rcv/3

rcv(N, _, _)
  when N == 'CER';
       N == 'CEA';
       N == 'DWR';
       N == 'DWA';
       N == 'DPR';
       N == 'DPA' ->
    false;

rcv(_, Pkt, #watchdog{transport = TPid,
                      dictionary = Dict0,
                      receive_data = T}) ->
    diameter_traffic:receive_message(TPid, Pkt, Dict0, T).

throwaway(S) ->
    throw({?MODULE, throwaway, S}).

%% rcv/2
%%
%% The lack of Hop-by-Hop and End-to-End Identifiers checks in a
%% received DWA is intentional. The purpose of the message is to
%% demonstrate life but a peer that consistently bungles it by sending
%% the wrong identifiers causes the connection to toggle between OPEN
%% and SUSPECT, with failover and failback as result, despite there
%% being no real problem with connectivity. Thus, relax and accept any
%% incoming DWA as being in response to an outgoing DWR.

%%   INITIAL       Receive DWA          Pending = FALSE
%%                                      Throwaway()          INITIAL
%%   INITIAL       Receive non-DWA      Throwaway()          INITIAL

rcv('DWA', #watchdog{status = initial} = S) ->
    throwaway(S#watchdog{pending = false});

rcv(_, #watchdog{status = initial} = S) ->
    throwaway(S);

%%   DOWN          Receive DWA          Pending = FALSE
%%                                      Throwaway()          DOWN
%%   DOWN          Receive non-DWA      Throwaway()          DOWN

rcv('DWA', #watchdog{status = down} = S) ->
    throwaway(S#watchdog{pending = false});

rcv(_, #watchdog{status = down} = S) ->
    throwaway(S);

%%   OKAY          Receive DWA          Pending = FALSE
%%                                      SetWatchdog()        OKAY
%%   OKAY          Receive non-DWA      SetWatchdog()        OKAY

rcv('DWA', #watchdog{status = okay} = S) ->
    set_watchdog(S#watchdog{pending = false});

rcv(_, #watchdog{status = okay} = S) ->
    set_watchdog(S);

%%   SUSPECT       Receive DWA          Pending = FALSE
%%                                      Failback()
%%                                      SetWatchdog()        OKAY
%%   SUSPECT       Receive non-DWA      Failback()
%%                                      SetWatchdog()        OKAY

rcv('DWA', #watchdog{status = suspect} = S) ->
    set_watchdog(S#watchdog{status = okay,
                            pending = false});

rcv(_, #watchdog{status = suspect} = S) ->
    set_watchdog(S#watchdog{status = okay});

%%   REOPEN        Receive DWA &        Pending = FALSE
%%                 NumDWA == 2          NumDWA++
%%                                      Failback()           OKAY

rcv('DWA', #watchdog{status = reopen,
                     num_dwa = 2 = N}
           = S) ->
    S#watchdog{status = okay,
               num_dwa = N+1,
               pending = false};

%%   REOPEN        Receive DWA &        Pending = FALSE
%%                 NumDWA < 2           NumDWA++             REOPEN

rcv('DWA', #watchdog{status = reopen,
                     num_dwa = N}
           = S) ->
    S#watchdog{num_dwa = N+1,
               pending = false};

%%   REOPEN        Receive non-DWA      Throwaway()          REOPEN

rcv(_, #watchdog{status = reopen} = S) ->
    throwaway(S).

%% timeout/1
%%
%% The caller sets the watchdog on the return value.

%%   OKAY          Timer expires &      SendWatchdog()
%%                 !Pending             SetWatchdog()
%%                                      Pending = TRUE       OKAY
%%   REOPEN        Timer expires &      SendWatchdog()
%%                 !Pending             SetWatchdog()
%%                                      Pending = TRUE       REOPEN

timeout(#watchdog{status = T,
                  pending = false}
        = S)
  when T == okay;
       T == reopen ->
    send_watchdog(S);

%%   OKAY          Timer expires &      Failover()
%%                 Pending              SetWatchdog()        SUSPECT

timeout(#watchdog{status = okay,
                  pending = true}
  = S) ->
    S#watchdog{status = suspect};

%%   SUSPECT       Timer expires        CloseConnection()
%%                                      SetWatchdog()        DOWN
%%   REOPEN        Timer expires &      CloseConnection()
%%                 Pending &            SetWatchdog()
%%                 NumDWA < 0                                DOWN

timeout(#watchdog{status = T,
                  pending = P,
                  num_dwa = N,
                  transport = TPid}
        = S)
  when T == suspect;
       T == reopen, P, N < 0 ->
    exit(TPid, {shutdown, watchdog_timeout}),
    S#watchdog{status = down};

%%   REOPEN        Timer expires &      NumDWA = -1
%%                 Pending &            SetWatchdog()
%%                 NumDWA >= 0                               REOPEN

timeout(#watchdog{status = reopen,
                  pending = true,
                  num_dwa = N}
        = S)
  when 0 =< N ->
    S#watchdog{num_dwa = -1};

%%   DOWN          Timer expires        AttemptOpen()
%%                                      SetWatchdog()        DOWN
%%   INITIAL       Timer expires        AttemptOpen()
%%                                      SetWatchdog()        INITIAL

%% RFC 3539, 3.4.1:
%%
%%   [5] While the connection is in the closed state, the AAA client MUST
%%       NOT attempt to send further watchdog messages on the connection.
%%       However, after the connection is closed, the AAA client continues
%%       to periodically attempt to reopen the connection.
%%
%%       The AAA client SHOULD wait for the transport layer to report
%%       connection failure before attempting again, but MAY choose to
%%       bound this wait time by the watchdog interval, Tw.

%% Don't bound, restarting the peer process only when the previous
%% process has died. We only need to handle state down since we start
%% the first watchdog when transitioning out of initial.

timeout(#watchdog{status = T} = S)
  when T == initial;
       T == down ->
    restart(S).

%% restart/1

restart(#watchdog{transport = undefined} = S) ->
    restart(getr(restart), S);
restart(S) ->
    S.

%% restart/2
%%
%% Only restart the transport in the connecting case. For an accepting
%% transport, there's no guarantee that an accepted connection in a
%% restarted transport if from the peer we've lost contact with so
%% have to be prepared for another watchdog to handle it. This is what
%% the diameter_reg registration in this module is for: the peer
%% connection is registered when leaving state initial and this is
%% used by a new accepting watchdog to realize that it's actually in
%% state down rather then initial when receiving notification of an
%% open connection.

restart({{connect, _} = T, Opts, Svc},
        #watchdog{parent = Pid,
                  sequence = Mask,
                  restrict = {R,_},
                  dictionary = Dict0}
        = S) ->
    send(Pid, {reconnect, self()}),
    Nodes = restrict_nodes(R),
    S#watchdog{transport = start(T, Opts, Mask, Nodes, Dict0, Svc),
               restrict = {R, lists:member(node(), Nodes)}};

%% No restriction on the number of connections to the same peer: just
%% die. Note that a state machine never enters state REOPEN in this
%% case.
restart({{accept, _}, _, _}, #watchdog{restrict = {_, false}}) ->
    stop;

%% Otherwise hang around until told to die.
restart({{accept, _}, _, _}, S) ->
    S.

%% Don't currently use Opts/Svc in the accept case.

%% dwr/1

dwr(#diameter_caps{origin_host = OH,
                   origin_realm = OR,
                   origin_state_id = OSI}) ->
    ['DWR', {'Origin-Host', OH},
            {'Origin-Realm', OR},
            {'Origin-State-Id', OSI}].

%% restrict_nodes/1

restrict_nodes(false) ->
    [];

restrict_nodes(nodes) ->
    [node() | nodes()];

restrict_nodes(node) ->
    [node()];

restrict_nodes(Nodes)
  when [] == Nodes;
       is_atom(hd(Nodes)) ->
    Nodes;

restrict_nodes(F) ->
    diameter_lib:eval(F).
-------------- next part --------------
A non-text attachment was scrubbed...
Name: smime.p7s
Type: application/pkcs7-signature
Size: 2182 bytes
Desc: Kryptograficzna sygnatura S/MIME
URL: <http://erlang.org/pipermail/erlang-bugs/attachments/20130520/44c7c439/attachment.bin>


More information about the erlang-bugs mailing list