[erlang-patches] Race condition in mnesia_locker after nodedown for sticky_locks

Jan Chochol jan.chochol@REDACTED
Fri Oct 4 08:52:19 CEST 2013


Hi OTP team,

We found a bug (a race condition) in the mnesia code concerning the
disconnection of nodes that own a sticky lock.
Our (simplified) setup consists of two nodes, where the first node acts
as the active one and the second acts as standby.
This means that all requests normally go through the first node and the
second node is only used after the first one fails. Mnesia in our
scenario is used as an HA data store.
For this setup sticky locks seemed to be the ideal solution - all
operations are performed on one node only.
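(For reference, a sticky lock is obtained by writing with the
sticky_write lock kind - this is also what the reproducer at the end of
this mail does on its test table t1:)

%% Writing with sticky_write leaves a sticky lock on the table at this
%% node, so later transactions here can lock it without asking the other
%% node - until the lock is switched away or the node goes down.
{atomic, ok} = mnesia:transaction(fun() ->
        mnesia:write(t1, {t1, a, b}, sticky_write)
    end).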
Everything worked well, until we found that transactions started just
after disconnecting the first node sometimes hang indefinitely.
This bogus behaviour was very rare, so it seemed that there is some
race condition in how sticky locks are handled after the disconnection
of a node.

We found that there is a problem in the synchronization between the
transaction process (the process initiating the transaction) and the
mnesia_locker process.
It can happen that the mnesia_locker process responds to a lock request
with a {switch, N, Req} message, even though node N is already
disconnected.
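
Roughly, the client side of the lock protocol works like this (a
simplified sketch with approximate names, not the actual mnesia_locker
source):

%% Simplified sketch with approximate names - not the real mnesia_locker
%% source. The transaction process asks its local locker for a read lock;
%% if the sticky lock is owned by another node it gets {switch, N, Req}
%% back and re-sends the request to node N itself.
request_read_lock(Node, Req, Store) ->
    {mnesia_locker, Node} ! {self(), Req},
    receive
        {mnesia_locker, Node, {switch, N, Req2}} ->
            ets:insert(Store, {nodes, N}),
            %% If N was disconnected a moment ago, this send is silently
            %% dropped and - as the trace below shows - nothing ever wakes
            %% the process up from the receive that follows (l_req_rec).
            {mnesia_locker, N} ! Req2,
            receive
                {mnesia_locker, N, Reply} ->
                    Reply;
                {mnesia_down, N} ->
                    {not_granted, {node_not_running, N}}
            end;
        {mnesia_locker, Node, Reply} ->
            Reply
    end.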

Part of the message trace showing the problem:

List of interesting PIDs:

mnesia_controller="<0.93.0>"
mnesia_tm="<0.89.0>"
mnesia_monitor="<0.85.0>"
mnesia_locker="<0.87.0>"
mnesia_recover="<0.88.0>"
transaction process="<0.206.0>"

Message trace:

% mnesia_monitor loses track of mnesia_monitor at node 1
(<0.85.0>) << {'EXIT',<5184.92.0>,noconnection}

% mnesia_monitor receives nodedown - this message is ignored
(<0.85.0>) << {nodedown,'gcfc-triple1_519129@REDACTED'}

% mnesia_monitor calls mnesia_recover:mnesia_down()
(<0.85.0>) <0.88.0> ! {'$gen_cast',
{mnesia_down,'gcfc-triple1_519129@REDACTED'}}

% mnesia_recover receives mnesia_down, nothing to do there
(<0.88.0>) << {'$gen_cast',{mnesia_down,'gcfc-triple1_519129@REDACTED'}}

% mnesia_monitor calls mnesia_controller:mnesia_down()
(<0.85.0>) <0.93.0> ! {'$gen_cast',
{mnesia_down,'gcfc-triple1_519129@REDACTED'}}

% mnesia_controller receives mnesia_down
(<0.93.0>) << {'$gen_cast',{mnesia_down,'gcfc-triple1_519129@REDACTED'}}

% mnesia_controller calls mnesia_recover:log_mnesia_down()
(<0.93.0>) <0.88.0> ! {'$gen_call', {<0.93.0>,#Ref<0.0.0.1860>},
{log_mnesia_down,'gcfc-triple1_519129@REDACTED'}}

% mnesia_recover receives log_mnesia_down, stores it in the decision table and replies
(<0.88.0>) << {'$gen_call',{<0.93.0>,#Ref<0.0.0.1860>},
{log_mnesia_down,'gcfc-triple1_519129@REDACTED'}}
(<0.88.0>) <0.93.0> ! {#Ref<0.0.0.1860>,ok}

% mnesia_controller receives response from mnesia_recover:log_mnesia_down()
(<0.93.0>) << {#Ref<0.0.0.1860>,ok}

% mnesia_controller calls mnesia_monitor:mnesia_down(mnesia_controller, Node)
(<0.93.0>) <0.85.0> ! {'$gen_cast', {mnesia_down,mnesia_controller,
'gcfc-triple1_519129@REDACTED'}}

% mnesia_monitor receives mnesia_down, mnesia_controller
(<0.85.0>) << {'$gen_cast', {mnesia_down,mnesia_controller,
'gcfc-triple1_519129@REDACTED'}}

% mnesia_monitor forwards mnesia_down to mnesia_tm
(<0.85.0>) <0.89.0> ! {mnesia_down,'gcfc-triple1_519129@REDACTED'}

% mnesia_tm receives mnesia_down
(<0.89.0>) << {mnesia_down,'gcfc-triple1_519129@REDACTED'}

% mnesia_tm calls mnesia_monitor:mnesia_down(mnesia_tm, {Node,
% ParticipantsTids})
(<0.89.0>) <0.85.0> ! {'$gen_cast', {mnesia_down,mnesia_tm,
{'gcfc-triple1_519129@REDACTED',[]}}}

% mnesia_monitor receives mnesia_down, mnesia_tm
% now it is going to call mnesia_locker:mnesia_down, which will send
% release_remote_non_pending
(<0.85.0>) << {'$gen_cast', {mnesia_down, mnesia_tm,
{'gcfc-triple1_519129@REDACTED',[]}}}

% Our process receives information about the node going down and tries
% to recover using mnesia
(<0.206.0>) << {activate,true}

% Starts outer transaction
(<0.206.0>) <0.89.0> ! {{<0.206.0>,#Ref<0.0.0.1875>},start_outer}
(<0.89.0>) << {{<0.206.0>,#Ref<0.0.0.1875>},start_outer}

% Reply for start_outer
(<0.89.0>) <0.206.0> ! {mnesia_tm,#Ref<0.0.0.1875>,
{new_tid,{tid,176,<0.206.0>},245835}}
(<0.206.0>) << {mnesia_tm,#Ref<0.0.0.1875>,
{new_tid,{tid,176,<0.206.0>},245835}}

% Try to obtain a lock for mnesia:select (the node that just failed held
% the sticky lock for that table)
(<0.206.0>) {mnesia_locker,'gcfc-triple2_519129@REDACTED'} !
{<0.206.0>, {read, {tid,176, <0.206.0>},
{'tasks_tt_gcfc-triple1_519129@REDACTED',
'______WHOLETABLE_____'}}}
(<0.87.0>) << {<0.206.0>, {read, {tid,176,<0.206.0>},
{'tasks_tt_gcfc-triple1_519129@REDACTED',
'______WHOLETABLE_____'}}}

% mnesia_locker informs our process that it needs to switch the sticky lock
(<0.87.0>) <0.206.0> ! {mnesia_locker,'gcfc-triple2_519129@REDACTED',
{switch,'gcfc-triple1_519129@REDACTED', {<0.206.0>, {read,
{tid,176,<0.206.0>},
{'tasks_tt_gcfc-triple1_519129@REDACTED',
'______WHOLETABLE_____'}}}}}
(<0.206.0>) << {mnesia_locker,'gcfc-triple2_519129@REDACTED',
{switch,'gcfc-triple1_519129@REDACTED', {<0.206.0>, {read,
{tid,176,<0.206.0>},
{'tasks_tt_gcfc-triple1_519129@REDACTED',
'______WHOLETABLE_____'}}}}}

% Our process now just wants to send {<0.206.0>, {read, {tid,176,<0.206.0>},
% {'tasks_tt_gcfc-triple1_519129@REDACTED',
% '______WHOLETABLE_____'}}} to the other node
% The other node is not connected, so Erlang tries to connect to it
% (<0.21.0> is the net_kernel process)
(<0.206.0>) <0.21.0> ! {'$gen_call', {<0.206.0>,#Ref<0.0.0.1884>},
{connect,normal,'gcfc-triple1_519129@REDACTED'}}

% The connection to the remote node was unsuccessful, and our process is
% now stuck in this function:
%
% l_req_rec(Node, Store) ->
%     ?ets_insert(Store, {nodes, Node}),
%     receive
%         {?MODULE, Node, Reply} ->
%             Reply;
%         {mnesia_down, Node} ->
%             {not_granted, {node_not_running, Node}}
%     end.
%
% It will never get out, as the message it sent was lost
(<0.206.0>) << {#Ref<0.0.0.1884>,false}

% mnesia_monitor calls mnesia_locker:mnesia_down, which sends
% release_remote_non_pending
(<0.85.0>) <0.87.0> ! {release_remote_non_pending,
'gcfc-triple1_519129@REDACTED',[]}

% mnesia_locker receives release_remote_non_pending
% it removes sticky locks of failed nodes and releases all transactions
% but it is too late - process <0.206.0> is already stuck waiting for a
% response from the disconnected node
(<0.87.0>) << {release_remote_non_pending,'gcfc-triple1_519129@REDACTED',[]}

% mnesia_locker calls mnesia_monitor:mnesia_down(mnesia_locker, Node)
(<0.87.0>) <0.85.0> ! {'$gen_cast', {mnesia_down,mnesia_locker,
'gcfc-triple1_519129@REDACTED'}}

% mnesia_monitor receives mnesia_down, mnesia_locker
(<0.85.0>) << {'$gen_cast',
{mnesia_down,mnesia_locker,'gcfc-triple1_519129@REDACTED'}}

% mnesia_monitor calls mnesia_lib:report_system_event, which sends a
% notify message to mnesia_event
(<0.85.0>) <0.83.0> ! {notify, {mnesia_system_event,
{mnesia_down,'gcfc-triple1_519129@REDACTED'}}}


So the problem is that the transaction process sends a message to a
disconnected node and waits for a response that will never come.
One of the easiest solutions is to monitor the remote process, which
prevents this from happening.
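
In other words (an illustrative sketch only - the actual change is in
the patch linked below), the waiting side would look roughly like this:

%% Illustrative sketch, not the actual patch. Monitoring the remote
%% locker before sending turns a lost message into an immediate 'DOWN'
%% notification, so the caller never waits forever.
send_lock_request(Node, Req) ->
    Ref = erlang:monitor(process, {mnesia_locker, Node}),
    {mnesia_locker, Node} ! Req,
    receive
        {mnesia_locker, Node, Reply} ->
            erlang:demonitor(Ref, [flush]),
            Reply;
        {mnesia_down, Node} ->
            erlang:demonitor(Ref, [flush]),
            {not_granted, {node_not_running, Node}};
        {'DOWN', Ref, process, _Object, _Reason} ->
            %% Remote node is down or unreachable
            {not_granted, {node_not_running, Node}}
    end.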

The suggested patch is available at:

git fetch git://github.com/janchochol/otp.git sticky_lock_race

https://github.com/janchochol/otp/compare/erlang:maint...sticky_lock_race
https://github.com/janchochol/otp/compare/erlang:maint...sticky_lock_race.patch


You can try to reproduce the problem with the following module:

--- BEGIN OF slock.erl ---
%% stick_lock problem reproducer
%%
%% To reproduce the problem, execute these commands in parallel:
%%
%% erl -sname n2@REDACTED -cookie slock -s slock n2
%% erl -sname n1@REDACTED -cookie slock -s slock n1
%%
%% The problem is timing related, so it has a relatively low probability
%% of reproducing.
%% To increase the probability, use this patch:
%%
%% --- a/lib/mnesia/src/mnesia_locker.erl
%% +++ b/lib/mnesia/src/mnesia_locker.erl
%% @@ -663,6 +663,7 @@ mnesia_down(N, Pending) ->
%%             %% race with mnesia_tm's coordinator processes
%%             %% that may restart and acquire new locks.
%%             %% mnesia_monitor ensures the sync.
%% +           timer:sleep(1000),
%%             Pid ! {release_remote_non_pending, N, Pending}
%%      end.
%%
%%
%% When the problem appears, the n1 node will hang indefinitely.
%% When the problem is not present, n1 will finish shortly after n2.


-module(slock).

-export([n1/0, n2/0]).

wait_for_n2() ->
    case rpc:call(n2@REDACTED, erlang, whereis, [n2]) of
        Pid when is_pid(Pid) ->
            Pid;
        _ ->
            timer:sleep(100),
            wait_for_n2()
    end.

wait_fun(Fun, PollIntervalMs) ->
    case Fun() of
        false ->
            timer:sleep(PollIntervalMs),
            wait_fun(Fun, PollIntervalMs);
        true ->
            ok
    end.


n1() ->
    register(n1, self()),
    io:format("n1: statred~n", []),
    io:format("n1: waiting for n2~n", []),
    N2 = wait_for_n2(),
    io:format("n1: n2 found~n", []),
    io:format("n1: init mnesia start~n", []),
    ok = application:start(mnesia, permanent),
    ok = rpc:call(n2@REDACTED, application, start, [mnesia, permanent]),
    {ok, [_ | _]} = mnesia:change_config(extra_db_nodes, [n2@REDACTED]),
    {atomic, ok} = mnesia:create_table(t1, [{ram_copies,
                                             [n1@REDACTED, n2@REDACTED]}]),
    ok = rpc:call(n2@REDACTED, mnesia, wait_for_tables, [[t1], infinity]),
    io:format("n1: init mnesia end~n", []),
    N2 ! {ping, self()},
    receive step -> ok end,
    wait_fun(fun() -> net_adm:ping(n2@REDACTED) =:= pang end, 100),
    io:format("n1: select start~n", []),
    Ret = mnesia:transaction(fun() ->
                wait_fun(fun() -> net_adm:ping(n2@REDACTED) =:= pang end, 10),
                mnesia:select(t1, [{{t1, '_', '_'}, [], ['$_']}])
        end),
    io:format("n1: select end: ~p~n", [Ret]),
    init:stop().

n2() ->
    register(n2, self()),
    io:format("n2: statred~n", []),
    io:format("n2: waiting for n1 ping~n", []),
    N2 = receive
        {ping, IN2} ->
            IN2
    end,
    io:format("n2: n1 pinged~n", []),
    io:format("n2: write start~n", []),
    mnesia:transaction(fun() -> mnesia:write(t1, {t1, a, b}, sticky_write) end),
    io:format("n2: write end~n", []),
    N2 ! step,
    io:format("n2: finish~n", []),
    erlang:halt().
--- END OF slock.erl ---

I do not see any other easy solution for this problem.

Best Regards,
Jan Chochol


