[erlang-bugs] net_kernel hang, perhaps blocked by busy_dist_port race?

Scott Lystig Fritchie fritchie@REDACTED
Fri May 21 17:44:06 CEST 2010


Hans Bolinder <hans.bolinder@REDACTED> wrote:

>> New update: recipe to duplicate.

hb> Great work. Much appreciated!

hb> We've been able to reproduce the scenario you describe.

Cool.

Attached is another idea for a fix.  Instead of a VM fix, it patches
net_kernel.erl to avoid direct replies by the 'net_kernel' process.
It's perhaps better because it avoids mucking with the VM, but perhaps
worse because it isn't clear whether the same port blocking + process
unscheduling could cause similar problems for other processes such as
'global_group'.

-Scott
-------------- next part --------------
--- /usr/local/src/erlang/otp_src_R13B04/lib/kernel/src/net_kernel.erl.orig	2009-11-20 07:29:33.000000000 -0600
+++ ./net_kernel.erl	2010-05-20 18:21:34.000000000 -0500
@@ -354,13 +354,13 @@
 %% The response is delayed until the connection is up and
 %% running.
 %%
-handle_call({connect, _, Node}, _From, State) when Node =:= node() ->
-    {reply, true, State};
+handle_call({connect, _, Node}, From, State) when Node =:= node() ->
+    async_reply({reply, true, State}, From);
 handle_call({connect, Type, Node}, From, State) ->
     verbose({connect, Type, Node}, 1, State),
     case ets:lookup(sys_dist, Node) of
 	[Conn] when Conn#connection.state =:= up ->
-	    {reply, true, State};
+	    async_reply({reply, true, State}, From);
 	[Conn] when Conn#connection.state =:= pending ->
 	    Waiting = Conn#connection.waiting,
 	    ets:insert(sys_dist, Conn#connection{waiting = [From|Waiting]}),
@@ -376,19 +376,19 @@
 		    {noreply,State#state{conn_owners=Owners}};
 		_  ->
 		    ?connect_failure(Node, {setup_call, failed}),
-		    {reply, false, State}
+		    async_reply({reply, false, State}, From)
 	    end
     end;
 
 %%
 %% Close the connection to Node.
 %%
-handle_call({disconnect, Node}, _From, State) when Node =:= node() ->
-    {reply, false, State};
-handle_call({disconnect, Node}, _From, State) ->
+handle_call({disconnect, Node}, From, State) when Node =:= node() ->
+    async_reply({reply, false, State}, From);
+handle_call({disconnect, Node}, From, State) ->
     verbose({disconnect, Node}, 1, State),
     {Reply, State1} = do_disconnect(Node, State),
-    {reply, Reply, State1};
+    async_reply({reply, Reply, State1}, From);
 
 %% 
 %% The spawn/4 BIF ends up here.
@@ -411,39 +411,39 @@
 %% 
 %% Only allow certain nodes.
 %% 
-handle_call({allow, Nodes}, _From, State) ->
+handle_call({allow, Nodes}, From, State) ->
     case all_atoms(Nodes) of
 	true ->
 	    Allowed = State#state.allowed,
-	    {reply,ok,State#state{allowed = Allowed ++ Nodes}};  
+	    async_reply({reply,ok,State#state{allowed = Allowed ++ Nodes}}, From);  
 	false ->
-	    {reply,error,State}
+	    async_reply({reply,error,State}, From)
     end;
 
 %% 
 %% authentication, used by auth. Simply works as this:
 %% if the message comes through, the other node IS authorized.
 %% 
-handle_call({is_auth, _Node}, _From, State) ->
-    {reply,yes,State};
+handle_call({is_auth, _Node}, From, State) ->
+    async_reply({reply,yes,State}, From);
 
 %% 
 %% Not applicable any longer !?
 %% 
 handle_call({apply,_Mod,_Fun,_Args}, {From,Tag}, State) 
   when is_pid(From), node(From) =:= node() ->
-    gen_server:reply({From,Tag}, not_implemented),
+    async_gen_server_reply({From,Tag}, not_implemented),
 %    Port = State#state.port,
 %    catch apply(Mod,Fun,[Port|Args]),
     {noreply,State};
 
-handle_call(longnames, _From, State) ->
-    {reply, get(longnames), State};
+handle_call(longnames, From, State) ->
+    async_reply({reply, get(longnames), State}, From);
 
-handle_call({update_publish_nodes, Ns}, _From, State) ->
-    {reply, ok, State#state{publish_on_nodes = Ns}};
+handle_call({update_publish_nodes, Ns}, From, State) ->
+    async_reply({reply, ok, State#state{publish_on_nodes = Ns}}, From);
 
-handle_call({publish_on_node, Node}, _From, State) ->
+handle_call({publish_on_node, Node}, From, State) ->
     NewState = case State#state.publish_on_nodes of
 		   undefined ->
 		       State#state{publish_on_nodes =
@@ -457,11 +457,11 @@
 		  Nodes ->
 		      lists:member(Node, Nodes)
 	      end,
-    {reply, Publish, NewState};
+    async_reply({reply, Publish, NewState}, From);
 
 
-handle_call({verbose, Level}, _From, State) ->
-    {reply, State#state.verbose, State#state{verbose = Level}};
+handle_call({verbose, Level}, From, State) ->
+    async_reply({reply, State#state.verbose, State#state{verbose = Level}}, From);
 
 %%
 %% Set new ticktime
@@ -471,16 +471,16 @@
 %% #tick_change{} record if the ticker process has been upgraded;
 %% otherwise, an integer or an atom.
 
-handle_call(ticktime, _, #state{tick = #tick{time = T}} = State) ->
-    {reply, T, State};
-handle_call(ticktime, _, #state{tick = #tick_change{time = T}} = State) ->
-    {reply, {ongoing_change_to, T}, State};
+handle_call(ticktime, From, #state{tick = #tick{time = T}} = State) ->
+    async_reply({reply, T, State}, From);
+handle_call(ticktime, From, #state{tick = #tick_change{time = T}} = State) ->
+    async_reply({reply, {ongoing_change_to, T}, State}, From);
 
-handle_call({new_ticktime,T,_TP}, _, #state{tick = #tick{time = T}} = State) ->
+handle_call({new_ticktime,T,_TP}, From, #state{tick = #tick{time = T}} = State) ->
     ?tckr_dbg(no_tick_change),
-    {reply, unchanged, State};
+    async_reply({reply, unchanged, State}, From);
 
-handle_call({new_ticktime,T,TP}, _, #state{tick = #tick{ticker = Tckr,
+handle_call({new_ticktime,T,TP}, From, #state{tick = #tick{ticker = Tckr,
 							time = OT}} = State) ->
     ?tckr_dbg(initiating_tick_change),
     start_aux_ticker(T, OT, TP),
@@ -493,14 +493,14 @@
 		  ?tckr_dbg(shorter_ticktime),
 		  shorter
 	  end,
-    {reply, change_initiated, State#state{tick = #tick_change{ticker = Tckr,
+    async_reply({reply, change_initiated, State#state{tick = #tick_change{ticker = Tckr,
 							      time = T,
-							      how = How}}};
+							      how = How}}}, From);
 
-handle_call({new_ticktime,_,_},
-	    _,
+handle_call({new_ticktime,_,_},
+	    From,
 	    #state{tick = #tick_change{time = T}} = State) ->
-    {reply, {ongoing_change_to, T}, State}.
+    async_reply({reply, {ongoing_change_to, T}, State}, From).
 
 %% ------------------------------------------------------------
 %% handle_cast.
@@ -1079,11 +1079,11 @@
 
 spawn_func(link,{From,Tag},M,F,A,Gleader) ->
     link(From),
-    gen_server:reply({From,Tag},self()),  %% ahhh
+    async_gen_server_reply({From,Tag},self()),  %% ahhh
     group_leader(Gleader,self()),
     apply(M,F,A);
 spawn_func(_,{From,Tag},M,F,A,Gleader) ->
-    gen_server:reply({From,Tag},self()),  %% ahhh
+    async_gen_server_reply({From,Tag},self()),  %% ahhh
     group_leader(Gleader,self()),
     apply(M,F,A).
 
@@ -1409,7 +1409,7 @@
     reply_waiting1(lists:reverse(Waiting), Rep).
 
 reply_waiting1([From|W], Rep) ->
-    gen_server:reply(From, Rep),
+    async_gen_server_reply(From, Rep),
     reply_waiting1(W, Rep);
 reply_waiting1([], _) ->
     ok.
@@ -1511,3 +1511,10 @@
 
 getnode(P) when is_pid(P) -> node(P);
 getnode(P) -> P.
+
+async_reply({reply, Msg, State}, From) ->
+    async_gen_server_reply(From, Msg),
+    {noreply, State}.
+
+async_gen_server_reply(From, Msg) ->
+    spawn(fun() -> gen_server:reply(From, Msg) end).


More information about the erlang-bugs mailing list