[erlang-questions] Diagnosing gen_server call timeouts

Roger Lipscombe roger@REDACTED
Fri Sep 20 23:49:30 CEST 2019


In the end, I used 'dbg'; here's the relevant snippet:

AssertElapsed =
    fun(Pid, Call, Req, Then) ->
        Now = erlang:monotonic_time(),
        ElapsedMicros = erlang:convert_time_unit(Now - Then, native,
microsecond),
        if
            ElapsedMicros > 5000 ->
                io:format("~s ~p(~p) took ~B us\n",
                    [lists:flatten(ec_date:format("c",
calendar:universal_time())), Call, Req, ElapsedMicros]);
            true ->
                ok
        end,
        {message_queue_len, QueueLen} = erlang:process_info(Pid,
message_queue_len),
        if
            QueueLen > 5 ->
                io:format("~s ~p\n",
[lists:flatten(ec_date:format("c", calendar:universal_time())),
erlang:process_info(Pid, messages)]);
            true ->
                ok
        end
    end.

%% TState :: map(Fun -> [{Time, Arg0}]).
TState0 = #{}.

TFun =
    fun(_Msg = {trace, _Pid, call, {Mod, Fun, [Arg0 | _]}}, TState) ->
        % push an entry timestamp.
        Call = {erlang:monotonic_time(), Arg0},
        maps:update_with(Fun, fun(Calls) -> [Call | Calls] end, [Call], TState);

       (_Msg = {trace, _Pid, return_from, {Mod, Fun, _Arity},
_Result}, TState) ->
        % pop entry timestamp.
        [{Then, Arg0} | Calls] = maps:get(Fun, TState),
        AssertElapsed(Pid, Fun, Arg0, Then),
        maps:update(Fun, Calls, TState);

       (Msg, TState) ->
        io:format("~p\n", [Msg]),
        TState
    end.

dbg:start().
dbg:tracer(process, {TFun, TState0}).
dbg:tpl(Mod, handle_call, '_', dbg:fun2ms(fun(_) -> return_trace() end)).
dbg:tpl(Mod, handle_info, '_', dbg:fun2ms(fun(_) -> return_trace() end)).
dbg:p(Pid, c).

In my particular case, handle_info was occasionally taking ~30
seconds, depending on the input, meaning that if a gen_server:call
occurred immediately afterwards, there was a good chance it would time
out.

Aside: I *always* forget that maps:update_with/4 doesn't apply Fun to Init.

On Thu, 19 Sep 2019 at 18:33, Roger Lipscombe <roger@REDACTED> wrote:
>
> I've got a gen_server:call that -- very occasionally -- suffers from a
> timeout. Obviously this means that my gen_server is already busy doing
> something else.
>
> Is there a way that I can instrument gen_server so that it will log
> something if it takes too long to return from a callback? That is: my
> handle_call is timing out because, presumably, there's another
> handle_call (or handle_info, etc.) that's blocking. I'd like *that* to
> be logged if it takes longer than, say, 200 milliseconds to return.
>
> Or, I guess, if the message queue length is excessive at any point.
>
> Caveat: I've got ~20,000 of these gen_server processes, and this only
> happens intermittently in production, so I'm thinking that tracing
> *isn't* the right answer here.



More information about the erlang-questions mailing list