[erlang-questions] Erlang shows its slow face!

Sun Nov 14 01:33:25 CET 2010

PARALLELISE, PARALLELISE, PARALLELISE!!!

I took your second algo and Tony's improved version (both pretty much  
unchanged) and parallelised them below with little effort. Permutations  
for B and C are calculated on a separate concurrent* process for every A  
then the result is combined (e.g when N = 300, 300 concurrent worker  
processes are spawned.)

I consistently obtained a 40-45% improvement in execution time regardless  
of algo. I also noticed that non-parallel versions were using only ONE of  
my cores whereas parallel versions were using BOTH my cores. Here are the  
results for N = 300 (debug_info BEAM complied)...

                 Sequential      Concurrent
Your original   8.44s           N/A
Your py2        9.86s           5.51s
Tony's pythag2  1.95s           1.17s**

* This is just to illustrate that with any language, one should focus on  
it's promises/strengths. Sometimes this can even de-emphasise it's  
weaknesses. Though I still think it's best to use a tool that makes  
pledges towards the problem area in the first place.

** This is probably roughly as fast the C# implementation but that's  
besides the point!

I agree with Bernard. IMO, you're barking up the wrong tree for doing this  
sort of thing quickly using either Erlang or C#. When you look closer, the  
bottleneck with all the solutions so far isn't the calculation itself (I  
was wrong about that earlier) -- it's actually the permutation (the part  
done with accumulators/generators). Possibly a language with pointers  
would be better at doing this fast since you'd be able to allocate a  
fixed-size destructive memory structure to walk through and modify (that's  
just an educated guess). I'd put my money on the horse named C NIF.

Anyways, here's the parallelised code...

NB: lpmap is a library function best stuck in a separate utility module.  
It's my modified + edocumented version of Joe Armstrong's pmap from his  
book "Programming Erlang: Software for a Concurrent World" Section 20.2  
"Parallelizing Sequential Code" (go buy it!) The original source is at  
http://www.pragprog.com/titles/jaerlang/source_code/

---- START CODE ----

py2(Max)->
     lists:flatten(lpmap(fun(A) ->
                             forb(A, 1, [], Max)
                         end, lists:seq(1, Max), ordered)).

forb(A, B, Acc, Max) ->
     Acc1 = forc(A, B, 1, Acc, Max),
     case B < Max of
         true -> forb(A, B+1, Acc1, Max);
         false -> Acc1
     end.

forc(A, B, C, Acc, Max) ->
     Acc1 = case (A*A + B*B =:= C*C) andalso (A+B+C =< Max) of
                 true -> [{A,B,C}|Acc];
                 _ -> Acc
            end,
     case C < Max of
         true-> forc(A, B, C+1, Acc1, Max);
         false-> Acc1
     end.

pythag2(N)->
     lists:flatten(lpmap(fun(A) ->
                             pythan2_B(A, 1, N, [])
                         end, lists:seq(1, N), ordered)).

pythan2_A(A, N, Acc) when A > N -> Acc;
pythan2_A(A, N, Acc) -> pythan2_A(A+1,N,pythan2_B(A, 1, N, Acc)).

pythan2_B(A, B, N, Acc) when A+B > N -> Acc;
pythan2_B(A, B, N, Acc) -> pythan2_B(A,B+1,N,pythan2_C(A, B, 1, N, Acc)).

pythan2_C(A, B, C, N, Acc) when A+B+C > N -> Acc;
pythan2_C(A, B, C, N, Acc) ->
     if A*A+B*B =:= C*C ->
	    pythan2_C(A, B, C+1, N, [{A,B,C}|Acc]);
        true ->
	    pythan2_C(A, B, C+1, N, Acc)
     end.

%% @spec    lpmap(fun(), list(), (atom() = ordered|unordered)) -> list()
%% @doc     Spawns a process for each element in list L, performs specified
%%          function F against each in parallel and then returns results  
either
%%          same order as L (ordered) or in any order (unordered).
%%          NB: See also lpmap/4.

lpmap(F, L, ordered) ->
     Ref = erlang:make_ref(),
     Pids = [lpmap_spawn_link(self(), Ref, F, I) || I <- L],
     lpmap_gather_ordered(Pids, Ref, [], 0, void);
lpmap(F, L, unordered) ->
     Ref = erlang:make_ref(),
     lists:foreach(fun(I) ->
                     lpmap_spawn_link(self(), Ref, F, I)
                   end, L),
     lpmap_gather_unordered(length(L), Ref, [], 0, void).

%% @spec    lpmap(fun(), integer(), list(), (atom() = ordered|unordered))  
-> list()
%% @doc     Same as lpmap/3 except ensures only a maximum of MaxPs parallel
%%          processes execute function F at any one time (i.e. first takes  
MaxPs
%%          items from list, executes F in parallel against each, then as  
each
%%          process returns, spawns another process on next item in L as  
long as
%%          active processes are less than MaxPs).
%%          NB: See also lpmap/4.

lpmap(F, L, MaxPs, ordered) when MaxPs>0 ->
     Ref = erlang:make_ref(),
     {HPids, TPids} = if
                         length(L) > MaxPs -> lists:split(MaxPs, L);
                         true -> {L, []}
                      end,
     Pids = [lpmap_spawn_link(self(), Ref, F, I) || I <- HPids],
     lpmap_gather_ordered(Pids, Ref, TPids, MaxPs, F);
lpmap(F, L, MaxPs, unordered) when MaxPs>0 ->
     Ref = erlang:make_ref(),
     {HPids, TPids} = if
                         length(L) > MaxPs -> lists:split(MaxPs, L);
                         true -> {L, []}
                      end,
     lists:foreach(fun(I) ->
                     lpmap_spawn_link(self(), Ref, F, I)
                   end, HPids),
     lpmap_gather_unordered(length(HPids), Ref, TPids, MaxPs, F).

%% lpmap internal functions

lpmap_spawn_link(Parent, Ref, F, I) ->
     spawn_link(fun() ->
                     Parent ! {self(), Ref, F(I)}
                end).

lpmap_gather_ordered([], _Ref, [], _MaxPs, _F) ->
     [];
lpmap_gather_ordered([HPid|TPids], Ref, L, MaxPs, F) ->
     receive
         {HPid, Ref, Ret} when length(TPids)<MaxPs, L=/=[] ->
             [H | T] = L,
             [Ret | lpmap_gather_ordered(
                 lists:append(TPids, [lpmap_spawn_link(self(), Ref, F, H)]),
                 Ref, T, MaxPs, F)];
         {HPid, Ref, Ret} ->
             [Ret | lpmap_gather_ordered(TPids, Ref, L, MaxPs, F)]
     end.

lpmap_gather_unordered(0, _Ref, [], _MaxPs, _F) ->
     [];
lpmap_gather_unordered(NPs, Ref, L, MaxPs, F) ->
     receive
         {_Pid, Ref, Ret} when NPs-1<MaxPs, L=/=[] ->
             [H | T] = L,
             lpmap_spawn_link(self(), Ref, F, H),
             [Ret | lpmap_gather_unordered(NPs, Ref, T, MaxPs, F)];
         {_Pid, Ref, Ret} ->
             [Ret | lpmap_gather_unordered(NPs-1, Ref, L, MaxPs, F)]
     end.

---- END CODE -----

- Edmond -

On Sun, 14 Nov 2010 02:59:18 +1100, Tony Rogvall <tony@REDACTED> wrote:

> Hi Gilberto!
>
> I did some rewrite of the FORM 2, that I think is more compatible with  
> the condition that A+B+C =< N,
> and the result is of course much better. How did you implement the C#  
> version ?
>
> My results for N = 300, times are measured with timer:tc.
>
> pythag0:		4837528		(4.8s)
> pythag1:		4289769		(4.3s)
> pythag2:		703632			(0.7s)
>
> And the result with +native
>
> pythag0:		1006865		(1.0s)
> pythag1:		441254			(0.4s)
> pythag2:		107387			(0.1s)
>
> /Tony
>
> My versions (original + the version that only generates the list once)
>
> -module(pythag).
>
> -compile(export_all).
>
> pythag0(N) ->
>     [ {A,B,C} ||
> 	A <- lists:seq(1,N),
> 	B <- lists:seq(1,N),
> 	C <- lists:seq(1,N),
> 	A+B+C =< N,
> 	A*A+B*B =:= C*C].
>
> pythag1(N) ->
>     L = lists:seq(1,N),
>     [ {A,B,C} ||
> 	A <- L,
> 	B <- L,
> 	C <- L,
> 	A+B+C =< N,
> 	A*A+B*B =:= C*C].
>
>
> pythag2(N) ->
>     lists:reverse(pythan2_A(1, N, [])).
>
> pythan2_A(A, N, Acc) when A > N -> Acc;
> pythan2_A(A, N, Acc) -> pythan2_A(A+1,N,pythan2_B(A, 1, N, Acc)).
>
> pythan2_B(A, B, N, Acc) when A+B > N -> Acc;
> pythan2_B(A, B, N, Acc) -> pythan2_B(A,B+1,N,pythan2_C(A, B, 1, N, Acc)).
>
> pythan2_C(A, B, C, N, Acc) when A+B+C > N -> Acc;
> pythan2_C(A, B, C, N, Acc) ->
>     if A*A+B*B =:= C*C ->
> 	    pythan2_C(A, B, C+1, N, [{A,B,C}|Acc]);
>        true ->
> 	    pythan2_C(A, B, C+1, N, Acc)
>     end.
>
>
>
>
>
> On 13 nov 2010, at 08.37, Gilberto Carmenate García wrote:
>
>> Hi all!
>> I have been doing tests to Erlang I found this funny stuff that makes
>> Pythagorean Triplets
>>
>> pythag(N) ->
>>    [ {A,B,C} ||
>>        A <- lists:seq(1,N),
>>        B <- lists:seq(1,N),
>>        C <- lists:seq(1,N),
>>        A+B+C =< N,
>>        A*A+B*B =:= C*C].
>>
>> I tested it agains an implementation I made in C# so, and takes 14
>> secounds in my pc to do with 300 numbers in Erlang however in c# is just
>> a secound, even when C# runs under VM too.
>> So I did all possible ways for me to implement differents manners in
>> Erlang looking for speed and all is the same, listed as  follows:
>>
>> So my question is, there are any way to do that even more fast, why 3
>> nestes fors structs in C# are more effients that lists:foldr or
>> lists:foreach in Erlang.
>>
>> %% FORMA 1
>> py1(Max)->
>>    L = lists:seq(1, Max),
>>    lists:foldr(
>>        fun(A, Acc3)->
>>            lists:foldr(
>>                fun(B, Acc2)->
>>                    lists:foldr(
>>                        fun(C, Acc)->
>>                            case ((A*A + B*B =:= C*C) andalso (A+B+C =<
>> Max)) of
>> 								true->
>>                                    [{A,B,C}|Acc];
>>                                false->
>>                                    Acc
>>                            end
>>                        end
>>                    , Acc2, L)
>>                end
>>            , Acc3, L)
>>        end
>>    , [], L).
>>
>>
>>
>> %% FORMA 2
>> py2(Max)->
>> 	fora(1, [], Max).
>>
>> fora(A, Acc, Max)->
>> 	Acc1 = forb(A,1, Acc, Max),
>> 	case A < Max of
>>        true->
>>            fora(A+1, Acc1, Max);
>>        false->
>>            Acc1
>>    end.
>>
>> forb(A,B, Acc, Max)->
>>    Acc1 = forc(A,B,1, Acc, Max),
>>    case B < Max of
>>        true->
>>            forb(A,B+1, Acc1, Max);
>>        false->
>>            Acc1
>>    end.
>>
>> forc(A,B,C, Acc, Max)->
>>    Acc1 = case (A*A + B*B =:= C*C) andalso (A+B+C =< Max) of
>>        true->
>>            [{A,B,C}|Acc];
>>        _->
>>            Acc
>>    end,
>>    case C < Max of
>>        true->
>>            forc(A,B,C+1, Acc1, Max);
>>        false->
>>            Acc1
>>    end.
>>
>> %% FORMA 3.
>> py3(Max)->
>> 	[{A,B,C} ||
>> 		A <-lists:seq(1, Max),
>> 		B <-lists:seq(1, Max),
>> 		C <-lists:seq(1, Max),
>> 		A*A + B*B =:= C*C,
>> 		A+B+C =< Max].
>>
>>
>> =======================================================================
>> Este mensaje ha sido enviado mediante el servicio de correo electronico  
>> que ofrece la Federacion de Radioaficionados de Cuba a sus miembros  
>> para respaldar el cumplimiento de los objetivos de la organizacion y su  
>> politica informativa. La persona que envia este correo asume el  
>> compromiso de  usar el servicio a tales fines y cumplir con las  
>> regulaciones establecidas.
>>
>>
>>
>> ________________________________________________________________
>> erlang-questions (at) erlang.org mailing list.
>> See http://www.erlang.org/faq.html
>> To unsubscribe; mailto:erlang-questions-unsubscribe@REDACTED
>>
>
> "Have run Make so many times I dunno what's installed anymore"
>

-- 
Using Opera's revolutionary e-mail client: http://www.opera.com/mail/