optimization tricks ?

Scott Lystig Fritchie scott@REDACTED
Mon May 15 22:48:31 CEST 2000

>>>>> "fc" == Francesco Cesarini <cesarini@REDACTED> writes:

fc> Erlang methodology:

Since most worthwhile performance increases come from algorithmic
changes, I'd have to agree with Francesco's methodology.  Discussion
on the list about whether to go for "beauty" first is just a minor
problem of choosing a decent algorithm ... for some value of decent.

Once you've decided that you need more speed, and algorithmic changes
have been exhausted (or are impractical), it's worth targeting your
optimization efforts.  The Erlang profiling tools are weak, but they
can help prevent you from optimizing code that has a minimal effect.
Several times, profiling has saved me from doing work: the code that I
thought was a performance problem really wasn't.  Being a lazy
performance optimizer is almost always a good thing.  Look for the low
hanging fruit first.

The "eprof" module in the Erlang distribution was giving me headaches
(as discussed in this forum a few months back).  Ulf Wiger, I believe,
told me to do what his team ended up doing when facing the same
problem: write my own profiler.  I chose instead to hack "eprof".  The
result isn't pretty, but it does a much better job than "eprof" does.

It still isn't as accurate as it should be, but it doesn't seem to be
my fault.  The trace messages generated by the BEAM VM don't always
cooperate.  I've seen cases where a gen_server callback looking
something like (pulling from memory):

	handle_call({foo, Bar, Baz}, From, State) ->
	    Reply = call_real_foo_func(Bar, Baz, State),
	    {reply, Reply, State);

	call_real_foo_func(A, B, S) when tuple(A) ->
	    do_foo(A, B, S);
	call_real_foo_func(A, B, S) when atom(A) ->
	    do_some_other_foo(A, B, S).

... would sometimes only tally one call to call_real_foo_func() when I
*know* it's being called hundreds of times via handle_call().  {shrug}
Given the rumors that R7 is going to have a new and improved profiler,
I haven't worried much about it because it's infrequent.  (If anyone
finds a bug in the code below, feel free to fix it and lemme know.  :-)

At any rate, the eprof-derived profiler code, dubbed slf_eprof.erl, is
included below.


P.S.  Oh, one other thing.  We learned a fair amount about our
application by enabling profiling when compiling the BEAM interpreter.
It's kindof a 10,000 meter overview of how things work, but it was
worth doing.  Add "-pg" to the "TYPE_FLAGS" line in
erts/emulator/Makefile.in, then a "./configure" at the top, then
compile, run your app, then analyze with "gprof".

P.P.S.  Eprof and slf_eprof are both really annoying because they
don't maintain call graphs.  (Difficult, perhaps impossible, with the
current tracing messages.)  Hopefully the new profiler will be able to
track a function's time and the time of its descendents.

--- snip --- snip --- snip --- snip --- snip --- snip --- snip --- snip --- 

%% ``The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved via the world wide web at http://www.erlang.org/.
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%% The Initial Developer of the Original Code is Ericsson Utvecklings AB.
%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings
%% AB. All Rights Reserved.''
%%     $Id: slf_eprof.erl,v 1.5 2000/03/28 00:11:10 scott Exp $
%% Purpose: Profile a system in order to figure out where the 
%% time goes.
%% Hacked by Scott Lystig Fritchie, <scott@REDACTED>.  Based in large
%% part on eprof.erl from the R6B distro ... and is *much* more accurate
%% than the original.  AFAIK, its errors aren't due to significant bugs
%% in this module but are rather caused by quirks from the trace messages
%% generated by the current BEAM VM.  Bug fixes and additions are welcome.
%% Rumor has it there's a new profiler in the works for R7.  Hopefully
%% this hacked eprof will not be necessary when R7 is released.  In
%% the meantime, feel free to use this module for whatever purpose
%% within the license's constraints.
%% New start() syntax.  start(wall_clock) measures wall-clock time, where
%% time spent swapped out will be counted against the function being traced.
%% start(swapped) will explictly measure the amount of time spent swapped out.
%% start() will default to start(swapped).
%% NOTE: Use of "wall_clock" is not recommended.  Caveat user.
%% It would be *really* cool if we could tell the difference between being
%% swapped out because our quantum was spent or because we we blocked
%% trying to "receive" on a mailbox that didn't have a matching message.
%% Example usage:	slf_eprof:start(swapped|wall_clock).


-define(NAME, slf_eprof).
-define(GARBAGE_FUNC, {erlang, garbage_collection, 0}).

-export([start/0, start/1, stop/0, dump/0, total_analyse/0,
	 start_profiling/1, profile/4, profile/1,
	 stop_profiling/0, analyse/0, log/1]).
-export([keep_history/1, get_history/0]).

%% Internal exports 

-import(lists, [flatten/1,reverse/1,keysort/2, member/2,keysearch/3]).

-record(state, {table = notable,
		method = void,
		proc = noproc, 
		profiling = false, 
		pfunc = nofunc,
		ptime = 0,
		acktime = 0,
		pop = running,
		keephistory = false,
		history = [],
		rootset = []}).


start() ->
start(Method) when atom(Method) ->
	Method == wall_clock; Method == swapped ->
	    gen_server:start({local, ?NAME}, ?NAME, [Method], []);
	true ->
	    io:format("usage: start(wall_clock | swapped)\n"),
	    {error, usage}
stop()  -> gen_server:call(?NAME, stop, infinity).

profile(Pids,M,F,A) ->
    gen_server:call(?NAME, {profile,Pids, M,F,A},infinity).

dump() -> 
    gen_server:call(?NAME, dump, infinity).

analyse() ->
    gen_server:call(?NAME, analyse, infinity).

log(File) ->
    gen_server:call(?NAME, {logfile, File}, infinity).

total_analyse() ->
    gen_server:call(?NAME, total_analyse, infinity).

start_profiling(Rootset) ->
    gen_server:call(?NAME, {profile, Rootset}, infinity).

stop_profiling() ->
    gen_server:call(?NAME, stop_profiling, infinity).

profile(Rs) ->

keep_history(Value) ->				% true | false
    gen_server:call(?NAME, {keep_history, Value}, infinity).

get_history() ->
    gen_server:call(?NAME, get_history, infinity).


init([Method]) ->
    process_flag(trap_exit, true),
    process_flag(priority, max), 
    put(oct, onecalltime()),			% The time erlang:time() takes.
    put(sched_time, sched_time()),
    {ok, #state{method = Method}}.

subtr({X1,Y1,Z1}, {X1,Y1,Z2}) ->
    Z1 - Z2;
subtr({X1,Y1,Z1}, {X1,Y2,Z2}) ->
    ((Y1 * 1000000) + Z1) - ((Y2 * 1000000) + Z2).

collect_trace_messages() ->
	X when tuple(X), element(1,X) == trace_ts ->
	    [X | collect_trace_messages()];
	X when tuple(X), element(1,X) == trace ->
	    [X | collect_trace_messages()]
    after 0 -> 

into_tab(Method, Tab, Pid, Op, Func, Time) ->
    case Func of
	%% If you wish to do pattern matching beyond what erlang:trace_pattern
	%% can do, put it here.  Too bad it needs to be recompiled to change
	%% the pattern.
	{erlang, make_ref, 0} ->
	    %% Do something first, perhaps?
	    into_tab2(Method, Tab, Pid, Op, Func, Time);
	_ ->
	    into_tab2(Method, Tab, Pid, Op, Func, Time)
into_tab2(Method, Tab, Pid, Op, Func, Time) ->
    LastKey = {last, Pid},
    case ets:lookup(Tab, LastKey) of
	[] ->		% First we've heard from this pid.
	    case Op of
		in ->
		    ets:insert(Tab, {LastKey, Func, Time});
		out ->
		return_to ->
		    ets:insert(Tab, {LastKey, Func, Time});
		call ->
		    ets:insert(Tab, {LastKey, Func, Time});
		gc_start ->
		    %% Pretend a pseudo-function: erlang:gc/0.
		    ets:insert(Tab, {LastKey, ?GARBAGE_FUNC, Time}),
		    %% Gotta push a bogus entry onto the "stack".
		    ets:insert(Tab, {{gc_stack, Pid}, {nofunc, nofunc, 0}, 0});
		gc_end ->
		_ ->
		    io:format("into_tab: first-timers: got op ~w for pid ~w\n", [Op, Pid])
	[{LastKey, LastFunc, LastTime}] -> % Perhaps stackable?
	    TimeDiff = subtr(Time, LastTime),
	    AccKey = {Pid, LastFunc},		% Accumulator key
	    case Op of
		in ->
			Method == swapped ->
			    %% Sum the amount of time we've spent swapped out.
			    sum_it(Tab, {Pid, LastFunc}, TimeDiff, 1);
			true ->
			    %% Don't inflate the number of times the suspended
			    %% function was called.
			    sum_it(Tab, {Pid, LastFunc}, TimeDiff, 0)
		    ets:insert(Tab, {LastKey, Func, Time});
		out ->
			Func == LastFunc ->
			true ->
		    sum_it(Tab, AccKey, TimeDiff, 0),
			Method == swapped ->
			    ets:insert(Tab, {LastKey, {erlang, swapped_out, 0},
			true ->
		return_to ->
		    sum_it(Tab, AccKey, TimeDiff, 0),
		    ets:insert(Tab, {LastKey, Func, Time});
		call ->
		    sum_it(Tab, AccKey, TimeDiff, 0),
		    sum_it(Tab, {Pid, Func}, 0, 1), % Incr count for this func
		    ets:insert(Tab, {LastKey, Func, Time});
		gc_start ->
		    sum_it(Tab, AccKey, TimeDiff, 0),
		    ets:insert(Tab, {LastKey, Func, Time}),

		    %% In this case, we need to remember what was going on
		    %% when the GC interrupted.  Push onto a "stack" (with
		    %% max depth of 1) the tuple from "Perhaps stackable?"
		    %% comment above.  We'll retrieve it when we hit gc_end.
		    ets:insert(Tab, {{gc_stack, Pid}, LastFunc, LastTime});
		gc_end ->
		    sum_it(Tab, {Pid, ?GARBAGE_FUNC}, TimeDiff, 1),
		    [{StackKey, StackFunc, StackTime}] =
			ets:lookup(Tab, {gc_stack, Pid}),
		    ets:insert(Tab, {{last, Pid}, StackFunc, StackTime});
		_ ->
		    io:format("into_tab: repeat: got op ~w for pid ~w\n", [Op, Pid])
	Error ->
	    io:format("ERROR: into_tab: got ~w\n", [Error])

do_messages([{trace_ts, Pid, Op, {M, F, A}, Time}|Tail], Tab, Method, Acc) when list(A) ->
    do_messages([{trace_ts, Pid, Op, {M, F, length(A)}, Time}|Tail], Tab, Method, Acc);
do_messages([{trace_ts, Pid, Op, Func, Time} = Msg|Tail], Tab, Method, Acc) ->
    into_tab(Method, Tab, Pid, Op, Func, Time),
    do_messages(Tail,Tab,Method, [Msg|Acc]);
do_messages([], Tab,Method, Acc) ->

sum_it(Tab, Key, Time, CallIncr) ->
    case ets:lookup(Tab, Key) of
	[] ->
	    ets:insert(Tab, {Key, Time, 1});
	[{Key, TotalTime, TotalCalls}] ->
	    ets:insert(Tab, {Key, TotalTime + Time, TotalCalls + CallIncr})


handle_cast(Req, S) -> {noreply, S}.

terminate(Reason, S) -> normal.


handle_call({logfile, F}, _FromTag, Status) ->
    case file:open(F, write) of
	{ok, Fd} ->
	    case get(fd) of
		undefined -> ok;
		FdOld -> file:close(FdOld)
	    put(fd, Fd),
	    {reply, ok, Status};
	{error, _} ->
	    {reply, error, Status}

handle_call({profile, Rootset}, {From, _Tag}, S) ->
    io:format("~s: Starting profiling ..... ~n",[?MODULE]),
    ptrac(S#state.rootset, false, all()),
    Tab = ets:new(?NAME, [set, public]),
    case ptrac(Rootset, true, all()) of
	false ->
	    {reply, error,  #state{method = S#state.method}};
	true ->
	    {reply, profiling, #state{table = Tab,
				      method = S#state.method,
				      proc = From,
				      profiling = true,
				      history = [],
				      rootset = Rootset}}

handle_call(stop_profiling, _FromTag, S) when S#state.profiling == true ->
    ptrac(S#state.rootset, false, all()),
    io:format("~s: Stop profiling~n",[?MODULE]),
    ets:delete(S#state.table, nofunc),
    {reply, profiling_stopped, S#state{profiling = false}};

handle_call(stop_profiling, _FromTag, S) ->
    {reply, profiling_already_stopped, S};

handle_call({profile, Rootset, M, F, A}, FromTag, S) ->
    io:format("~s: Starting profiling ..... ~n",[?MODULE]),
    ptrac(S#state.rootset, false, all()),
    put(replyto, FromTag),
    Tab = ets:new(?NAME, [set, public]),
    P = spawn_link(?NAME, call, [self(), M, F, A]),
    case ptrac([P|Rootset], true, all()) of
	true ->
	    {noreply, #state{table     = Tab, 
			     method    = S#state.method,
			     profiling = true,
			     rootset   = [P|Rootset]}};
	false ->
	    {reply, error, #state{method = S#state.method}}

handle_call(dump, _FromTag, S) ->
    {reply, dump(S#state.table), S};

handle_call(analyse, _FromTag, S) ->
    {reply, analyse(S#state.table), S};

handle_call(total_analyse, _FromTag, S) ->
    {reply, total_analyse(S#state.table), S};

handle_call({keep_history, Value}, _FromTag, S) ->
    {reply, Value, S#state{keephistory = Value}};

handle_call(get_history, _FromTag, S) ->
    {reply, lists:reverse(S#state.history), S};

handle_call(stop, _FromTag, S) ->
    {stop, normal, stopped, S}.


handle_info({trace_ts, From, Op, Func, Time} = Msg, S) when S#state.profiling == true ->
    put(start, convert_time(erlang:now())),
    Tmsgs = [{trace_ts, From, Op, Func, Time} | collect_trace_messages()],
    Messages = do_messages(Tmsgs, S#state.table, S#state.method, []),
	S#state.keephistory == true ->
	    {noreply, S#state{history = Messages ++ S#state.history}};
	true ->
	    {noreply, S}

handle_info({trace_ts, From, _, _, _}, S) when S#state.profiling == false ->
    ptrac([From], false, all()),
    {noreply, S};

handle_info({P, {answer, A}}, S) ->
    ptrac(S#state.rootset, false, all()),
    io:format("~s: Stop profiling~n",[?MODULE]),
    {From, Tag} = get(replyto),
    catch unlink(From),
    gen_server:reply(erase(replyto), {ok, A}),
    {noreply, S#state{profiling = false,
		      rootset = []}};

handle_info({'EXIT', P, Reason}, S) when S#state.profiling == true,
                                         S#state.proc == P  ->
    ptrac(S#state.rootset, false, all()),
    io:format("~s: Fail profiling~n",[?MODULE]),
    case erase(replyto) of
	undefined ->
	    {noreply, #state{method = S#state.method}};
	FromTag ->
	    gen_server:reply(FromTag, {error, Reason}),
	    {noreply, #state{method = S#state.method}}

handle_info({'EXIT', P, Reason}, S) ->
    {noreply, S}.


call(Top, M, F, A) ->
    Top ! {self(), {answer, apply(M,F,A)}}.

ptrac([all|T], How, Flags) ->
    case dotrace(all, How, Flags) of
	true ->
	    ptrac(T, How, Flags);
	false when How == true ->
	false ->
	    ptrac(T, How, Flags)

ptrac([P|T], How, Flags) when pid(P) ->
    case dotrace(P, How, Flags) of
	true ->
	    ptrac(T, How, Flags);
	false when How == true ->
	false ->
	    ptrac(T, How, Flags)

ptrac([P|T], How, Flags) when atom(P) ->
    case whereis(P) of
	undefined when How == true ->
	undefined when How == false ->
	    ptrac(T, How, Flags);
	Pid ->
	    ptrac([Pid|T], How, Flags)

ptrac([H|T], How, Flags) ->
    io:format("** ~s: Bad process: ~w~n",[?MODULE, H]),

ptrac([],_,_) -> true.

dotrace(P,How,What) ->
    case (catch erlang:trace(P, How, What)) of
	NumPids when integer(NumPids) ->
	    erlang:trace_pattern({'_', '_', '_'}, true),
	Other when How == false ->
	Other ->
	    io:format("** ~s: erlang:trace: bad process ~w: ~w~n",[?MODULE, P, Other]),

all() -> [call, old_call_trace, running, timestamp, set_on_spawn, garbage_collection].

total_analyse(notable) -> 
total_analyse(T) ->
    Pcalls = reverse(keysort(2, replicas(ets:tab2list(T)))),
    Time = collect_times(Pcalls),
    format("FUNCTION~44s      TIME    SECONDS   uSEC/CALL~n", ["CALLS"]),   
    printit(Pcalls, Time).

analyse(notable) -> 
analyse(T) ->
    Pids = ordsets:list_to_set(flatten(ets:match(T, {{'$1','_'},'_', '_'}))),
    Times = sum(ets:match(T, {'_','$1', '_'})),
    format("FUNCTION~44s      TIME ~n", ["CALLS"]),     
    do_pids(Pids, T, 0, Times).

do_pids([Pid|Tail], T, AckTime, Total) ->
    Pcalls = 
     reverse(keysort(2, to_tups(ets:match(T, {{Pid,'$1'}, '$2','$3'})))),
    Time = collect_times(Pcalls),
    PercentTotal = 100 * (divide(Time, Total)),
    format("~n****** Process ~w    -- ~s % of profiled time *** ~n", 
	   [Pid, fpf(PercentTotal)]),
    printit(Pcalls, Time),
    do_pids(Tail, T, AckTime + Time, Total);
do_pids([], _, _, _) -> 

printit([],_) -> ok;
printit([{{Mod,Fun,Arity}, Time, Calls} |Tail], ProcTime)  ->
    format("~s  ~s ~3s % ~9.4f   ~w~n", [ff(Mod,Fun,Arity), fint(Calls),
			      fpf(100*(divide(Time,ProcTime))), Time/1000000,
    printit(Tail, ProcTime);
printit([{{_,{Mod,Fun,Arity}}, Time, Calls} |Tail], ProcTime)  ->
    format("~s  ~s ~3s % ~9.4f   ~w~n", [ff(Mod,Fun,Arity), fint(Calls),
			      fpf(100*(divide(Time,ProcTime))), Time/1000000,
    printit(Tail, ProcTime); 
printit([_|T], Time) ->
    printit(T, Time).

ff(Mod,Fun,Arity) ->
    pad(flatten(io_lib:format("~w:~w/~w", [Mod,Fun, Arity])),45).

pad(Str, Len) -> 
    Strlen = length(Str),
	Strlen > Len -> strip_tail(Str, 45);
	true -> lists:append(Str, mklist(Len-Strlen))

strip_tail([H|T], 0) ->[];
strip_tail([H|T], I) -> [H|strip_tail(T, I-1)];
strip_tail([], I) -> [].

fpf(F) -> strip_tail(flatten(io_lib:format("~.2f", [F])), 5).
fint(Int) -> pad(flatten(io_lib:format("~w",[Int])), 10).

mklist(0) -> [];
mklist(I) -> [$ |mklist(I-1)].

to_tups(L) -> lists:map(fun(List) -> erlang:list_to_tuple(List) end, L).

divide(X,Y) -> X / Y.

collect_times([]) -> 0;
collect_times([Tup|Tail]) -> element(2, Tup) + collect_times(Tail).

dump(T) ->
    L = ets:tab2list(T),

format([H|T]) -> 
    format("~p~n", [H]), format(T);
format([]) -> ok.

format(F, A) ->
    case get(fd) of
	undefined -> ok;
	Fd -> io:format(Fd, F,A)

maybe_delete({T,Ref}) when reference(Ref) ->
    ets:delete({T, Ref});
maybe_delete(_) -> ok.

%% Try to figure out how much time a single erlang:time() call takes.

onecalltime() -> hd(lists:sort([oct(), oct(), oct()])).

oct() ->
    garbage_collect(),  %% ehhh
    N = erlang:now(),
    Time1 = subtr(erlang:now(), N) div 100,

    garbage_collect(),  %% ehhh
    N2 = erlang:now(),
    Time2 = subtr(erlang:now(), N2) div 100,
    (Time1 - Time2) div 2.

sched_time() ->
    P = spawn(?NAME, echo, []),
    X = erlang:now(),
    P ! self(),
    receive P -> ok end,
    subtr(erlang:now(), X).

echo() ->
    receive P -> P ! self() end.

call_loop(0,_) -> ok;
call_loop(I,time) ->
    erlang:now(), call_loop(I-1,time);
call_loop(I, notime) ->
    call_loop(I-1, notime).

convert_time({Msecs,Secs,Mysecs}) ->
    (1000000000000 * Msecs) + (1000000 * Secs) + Mysecs.

sum([[H]|T]) -> H + sum(T);
sum([]) -> 0.

replicas(L) ->
    replicas(L, []).

replicas([{{Pid, {Mod,Fun,Arity}}, Ack,Calls} |Tail], Result) ->
    case search({Mod,Fun,Arity},Result) of
	false ->
	    replicas(Tail, [{{Pid, {Mod,Fun,Arity}}, Ack,Calls} |Result]);
	{Ack2, Calls2} ->
	    Result2 = del({Mod,Fun,Arity}, Result),
	    replicas(Tail, [{{Pid, {Mod,Fun,Arity}}, 
			     Ack+Ack2,Calls+Calls2} |Result2])

replicas([_|T], Ack) ->  %% Whimpy
    replicas(T, Ack);

replicas([], Res) -> Res.

search(Key, [{{_,Key}, Ack, Calls}|_]) -> 
    {Ack, Calls};
search(Key, [_|T]) -> 
    search(Key, T);
search(Key, []) -> false.

del(Key, [{{_,Key}, Ack, Calls}|T]) ->
del(Key, [H | Tail]) ->
    [H|del(Key, Tail)];
del(Key, []) -> [].

flush_receive() ->
	{trace_ts, From, _, _, _} when pid(From) ->
	    ptrac([From], false, all()),
	_ ->
    after 0 ->

load_check() ->
load_check([{Mod, File} |Tail]) ->
    load_check_mod(Mod, keysearch(options, 1,
				  apply(Mod, module_info,[compile]))),
load_check([]) -> done.

load_check_mod(Mod, {value, {_, Opts}}) ->
    case member(trace, Opts) of
	true -> true;
	false -> 
	    io:format("** ~s: Warning module ~w not trace compiled ~n", 
		      [?MODULE, Mod])
load_check_mod(Mod, _) ->
    io:format("** ~s: No compile_opts found in ~w:module_info()~n", [?MODULE, Mod]).

More information about the erlang-questions mailing list