[erlang-bugs] net_kernel hang, perhaps blocked by busy_dist_port race?
Scott Lystig Fritchie
fritchie@REDACTED
Fri May 21 17:44:06 CEST 2010
Hans Bolinder <hans.bolinder@REDACTED> wrote:
>> New update: recipe to duplicate.
hb> Great work. Much appreciated!
hb> We've been able to reproduce the scenario you describe.
Cool.
Attached is another idea for a fix. Instead of patching the VM, it
patches net_kernel.erl so that the 'net_kernel' process never sends a
reply directly itself.
That is perhaps better because it avoids mucking with the VM, and
perhaps worse because it isn't clear whether the same port blocking +
process unscheduling could cause similar problems for other processes
such as 'global_group' (a sketch of what converting such a server
would look like follows the patch).
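The trick relies on the standard gen_server deferred-reply mechanism:
handle_call/3 may return {noreply, State} and let any other process
deliver the answer later with gen_server:reply/2, because the caller
matches the reply on a unique tag rather than on the sender. Here is
a minimal sketch of that pattern outside net_kernel (the module name
and the 'ping' call are made up for illustration only):

    -module(async_ping).
    -behaviour(gen_server).
    -export([start_link/0, ping/0]).
    -export([init/1, handle_call/3, handle_cast/2]).

    start_link() ->
        gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

    ping() ->
        gen_server:call(?MODULE, ping).

    init([]) -> {ok, []}.

    %% Instead of returning {reply, pong, State}, hand the reply to a
    %% throw-away process.  If sending the reply blocks (for example
    %% on a busy distribution port), only that process is suspended;
    %% the server keeps running.
    handle_call(ping, From, State) ->
        spawn(fun() -> gen_server:reply(From, pong) end),
        {noreply, State}.

    handle_cast(_Msg, State) ->
        {noreply, State}.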
-Scott
-------------- next part --------------
--- /usr/local/src/erlang/otp_src_R13B04/lib/kernel/src/net_kernel.erl.orig 2009-11-20 07:29:33.000000000 -0600
+++ ./net_kernel.erl 2010-05-20 18:21:34.000000000 -0500
@@ -354,13 +354,13 @@
%% The response is delayed until the connection is up and
%% running.
%%
-handle_call({connect, _, Node}, _From, State) when Node =:= node() ->
- {reply, true, State};
+handle_call({connect, _, Node}, From, State) when Node =:= node() ->
+ async_reply({reply, true, State}, From);
handle_call({connect, Type, Node}, From, State) ->
verbose({connect, Type, Node}, 1, State),
case ets:lookup(sys_dist, Node) of
[Conn] when Conn#connection.state =:= up ->
- {reply, true, State};
+ async_reply({reply, true, State}, From);
[Conn] when Conn#connection.state =:= pending ->
Waiting = Conn#connection.waiting,
ets:insert(sys_dist, Conn#connection{waiting = [From|Waiting]}),
@@ -376,19 +376,19 @@
{noreply,State#state{conn_owners=Owners}};
_ ->
?connect_failure(Node, {setup_call, failed}),
- {reply, false, State}
+ async_reply({reply, false, State}, From)
end
end;
%%
%% Close the connection to Node.
%%
-handle_call({disconnect, Node}, _From, State) when Node =:= node() ->
- {reply, false, State};
-handle_call({disconnect, Node}, _From, State) ->
+handle_call({disconnect, Node}, From, State) when Node =:= node() ->
+ async_reply({reply, false, State}, From);
+handle_call({disconnect, Node}, From, State) ->
verbose({disconnect, Node}, 1, State),
{Reply, State1} = do_disconnect(Node, State),
- {reply, Reply, State1};
+ async_reply({reply, Reply, State1}, From);
%%
%% The spawn/4 BIF ends up here.
@@ -411,39 +411,39 @@
%%
%% Only allow certain nodes.
%%
-handle_call({allow, Nodes}, _From, State) ->
+handle_call({allow, Nodes}, From, State) ->
case all_atoms(Nodes) of
true ->
Allowed = State#state.allowed,
- {reply,ok,State#state{allowed = Allowed ++ Nodes}};
+ async_reply({reply,ok,State#state{allowed = Allowed ++ Nodes}}, From);
false ->
- {reply,error,State}
+ async_reply({reply,error,State}, From)
end;
%%
%% authentication, used by auth. Simply works as this:
%% if the message comes through, the other node IS authorized.
%%
-handle_call({is_auth, _Node}, _From, State) ->
- {reply,yes,State};
+handle_call({is_auth, _Node}, From, State) ->
+ async_reply({reply,yes,State}, From);
%%
%% Not applicable any longer !?
%%
handle_call({apply,_Mod,_Fun,_Args}, {From,Tag}, State)
when is_pid(From), node(From) =:= node() ->
- gen_server:reply({From,Tag}, not_implemented),
+ async_gen_server_reply({From,Tag}, not_implemented),
% Port = State#state.port,
% catch apply(Mod,Fun,[Port|Args]),
{noreply,State};
-handle_call(longnames, _From, State) ->
- {reply, get(longnames), State};
+handle_call(longnames, From, State) ->
+ async_reply({reply, get(longnames), State}, From);
-handle_call({update_publish_nodes, Ns}, _From, State) ->
- {reply, ok, State#state{publish_on_nodes = Ns}};
+handle_call({update_publish_nodes, Ns}, From, State) ->
+ async_reply({reply, ok, State#state{publish_on_nodes = Ns}}, From);
-handle_call({publish_on_node, Node}, _From, State) ->
+handle_call({publish_on_node, Node}, From, State) ->
NewState = case State#state.publish_on_nodes of
undefined ->
State#state{publish_on_nodes =
@@ -457,11 +457,11 @@
Nodes ->
lists:member(Node, Nodes)
end,
- {reply, Publish, NewState};
+ async_reply({reply, Publish, NewState}, From);
-handle_call({verbose, Level}, _From, State) ->
- {reply, State#state.verbose, State#state{verbose = Level}};
+handle_call({verbose, Level}, From, State) ->
+ async_reply({reply, State#state.verbose, State#state{verbose = Level}}, From);
%%
%% Set new ticktime
@@ -471,16 +471,16 @@
%% #tick_change{} record if the ticker process has been upgraded;
%% otherwise, an integer or an atom.
-handle_call(ticktime, _, #state{tick = #tick{time = T}} = State) ->
- {reply, T, State};
-handle_call(ticktime, _, #state{tick = #tick_change{time = T}} = State) ->
- {reply, {ongoing_change_to, T}, State};
+handle_call(ticktime, From, #state{tick = #tick{time = T}} = State) ->
+ async_reply({reply, T, State}, From);
+handle_call(ticktime, From, #state{tick = #tick_change{time = T}} = State) ->
+ async_reply({reply, {ongoing_change_to, T}, State}, From);
-handle_call({new_ticktime,T,_TP}, _, #state{tick = #tick{time = T}} = State) ->
+handle_call({new_ticktime,T,_TP}, From, #state{tick = #tick{time = T}} = State) ->
?tckr_dbg(no_tick_change),
- {reply, unchanged, State};
+ async_reply({reply, unchanged, State}, From);
-handle_call({new_ticktime,T,TP}, _, #state{tick = #tick{ticker = Tckr,
+handle_call({new_ticktime,T,TP}, From, #state{tick = #tick{ticker = Tckr,
time = OT}} = State) ->
?tckr_dbg(initiating_tick_change),
start_aux_ticker(T, OT, TP),
@@ -493,14 +493,14 @@
?tckr_dbg(shorter_ticktime),
shorter
end,
- {reply, change_initiated, State#state{tick = #tick_change{ticker = Tckr,
+ async_reply({reply, change_initiated, State#state{tick = #tick_change{ticker = Tckr,
time = T,
- how = How}}};
+ how = How}}}, From);
handle_call({new_ticktime,_,_},
- _,
+ From,
#state{tick = #tick_change{time = T}} = State) ->
- {reply, {ongoing_change_to, T}, State}.
+ async_reply({reply, {ongoing_change_to, T}, State}, From).
%% ------------------------------------------------------------
%% handle_cast.
@@ -1079,11 +1079,11 @@
spawn_func(link,{From,Tag},M,F,A,Gleader) ->
link(From),
- gen_server:reply({From,Tag},self()), %% ahhh
+ async_gen_server_reply({From,Tag},self()), %% ahhh
group_leader(Gleader,self()),
apply(M,F,A);
spawn_func(_,{From,Tag},M,F,A,Gleader) ->
- gen_server:reply({From,Tag},self()), %% ahhh
+ async_gen_server_reply({From,Tag},self()), %% ahhh
group_leader(Gleader,self()),
apply(M,F,A).
@@ -1409,7 +1409,7 @@
reply_waiting1(lists:reverse(Waiting), Rep).
reply_waiting1([From|W], Rep) ->
- gen_server:reply(From, Rep),
+ async_gen_server_reply(From, Rep),
reply_waiting1(W, Rep);
reply_waiting1([], _) ->
ok.
@@ -1511,3 +1511,10 @@
getnode(P) when is_pid(P) -> node(P);
getnode(P) -> P.
+
+async_reply({reply, Msg, State}, From) ->
+ async_gen_server_reply(From, Msg),
+ {noreply, State}.
+
+async_gen_server_reply(From, Msg) ->
+ spawn(fun() -> gen_server:reply(From, Msg) end).
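If the same port blocking + process unscheduling does bite other
kernel servers such as 'global_group', the async_reply/2 and
async_gen_server_reply/2 helpers above could presumably be dropped in
there unchanged. A hedged sketch of what converting one clause would
look like ({lookup, Name} and do_lookup/2 are invented for
illustration, they are not real global_group calls):

-handle_call({lookup, Name}, _From, State) ->
-    {reply, do_lookup(Name, State), State};
+handle_call({lookup, Name}, From, State) ->
+    async_reply({reply, do_lookup(Name, State), State}, From);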