# Traversing a binary

Tony Rogvall <>
Mon Oct 4 23:17:38 CEST 2004

måndagen den 4 oktober 2004 kl 17.01 skrev Luke Gorrie:
>
> Maybe bignums could be some fun?
>
Yes!

> In RFC 1071 ("Computing the internet checksum") they have this tip:
>
>    (C)  Parallel Summation
>
>         On machines that have word-sizes that are multiples of 16 bits,
>         it is possible to develop even more efficient implementations.
>         Because addition is associative, we do not have to sum the
>         integers in the order they appear in the message.  Instead we
>         can add them in "parallel" by exploiting the larger word size.
>
>         To compute the checksum in parallel, simply do a 1's complement
>         addition of the message using the native word size of the
>         machine.  For example, on a 32-bit machine we can add 4 bytes
> at
>         a time: [A,B,C,D]+'... When the sum has been computed, we
> "fold"
>         the long sum into 16 bits by adding the 16-bit segments.  Each
>         16-bit addition may produce new end-around carries that must be
>
>
Added my flavor together with some results ...

First I did the "hardcoded" version cs_16/cs_32/cs_48/cs_64 then I
takes n*16 bits at a time...

The result shows that somewhere over 400 bits gives the good runtime,
even better results
with HIPE (but not presented here)
results where produced with a 800MHz Celeron
input size 3120151
this means that we can checksum about 14M/s
testing a file (otp_src_R8B-1.tar.gz 10914087) takes 0.77 sec which
approx verifies this,
running cs_n(Bin, 62)    (992 bits)

Cool stuff, Luke :-)

/Tony

Hardcoded versions:
16_h: {865282,45447}
32_h: {492006,45447}
48_h: {570898,45447}
64_h: {470681,45447}

General versions:
16_n: {1229178,45447}
32_n: {714159,45447}
48_n: {622312,45447}
64_n: {480053,45447}
80_n: {433242,45447}
96_n: {376938,45447}
112_n: {350193,45447}
128_n: {330382,45447}
144_n: {314732,45447}
160_n: {295219,45447}
176_n: {291303,45447}
192_n: {282718,45447}
208_n: {274458,45447}
224_n: {266926,45447}
240_n: {274168,45447}
256_n: {267168,45447}
272_n: {252319,45447}
288_n: {278400,45447}
304_n: {244431,45447}
320_n: {240133,45447}
336_n: {240713,45447}
352_n: {235871,45447}
368_n: {236378,45447}
384_n: {236018,45447}
400_n: {227867,45447}    <-  Here we start to see the good results
416_n: {227723,45447}
432_n: {228095,45447}
448_n: {226237,45447}
464_n: {221365,45447}
480_n: {217066,45447}
496_n: {240761,45447}
512_n: {237357,45447}
528_n: {244091,45447}
544_n: {235852,45447}
560_n: {239366,45447}
576_n: {234818,45447}
592_n: {244486,45447}
608_n: {238073,45447}
624_n: {265506,45447}
640_n: {227704,45447}
656_n: {228000,45447}
672_n: {225240,45447}
688_n: {237404,45447}
704_n: {226244,45447}
720_n: {228811,45447}
736_n: {227854,45447}
752_n: {240890,45447}
768_n: {240827,45447}
784_n: {221820,45447}
800_n: {222873,45447}
816_n: {218731,45447}
832_n: {219575,45447}
848_n: {216197,45447}
864_n: {214653,45447}
880_n: {220941,45447}
896_n: {217853,45447}
912_n: {218837,45447}
928_n: {220978,45447}
944_n: {217725,45447}
960_n: {217103,45447}
976_n: {248560,45447}
992_n: {215693,45447}  <- BEST so far
1008_n: {234033,45447}  <- Getting worse (need to run more loops to
check)
1024_n: {232044,45447}

%%
%% Checksum code (as stolen and patched from luke)
%%
-module(csum).

-compile(export_all).

%% -compile(inline).

t() ->
C = <<B/binary,B/binary,B/binary,B/binary,B/binary>>,
D = <<C/binary,C/binary,C/binary,C/binary,C/binary>>,
E = <<D/binary,D/binary,D/binary,D/binary,D/binary,
D/binary,D/binary,D/binary,D/binary,D/binary,
D/binary,D/binary,D/binary,D/binary,D/binary,
D/binary,D/binary,D/binary,D/binary,D/binary,
D/binary,D/binary,42:8/integer>>,
%% checksum_4 crashes on this:
%%    F = <<E/binary,E/binary,E/binary,E/binary,E/binary>>,
%% and performs better on this:
%%    F = D,
F = E,
io:format("N=~p~n", [size(F)]),
garbage_collect(),
io:format("M1: ~w~n",[timer:tc(?MODULE,checksum_1,[F,0])]),
garbage_collect(),
io:format("M2: ~w~n",[timer:tc(?MODULE,checksum_2,[F,0])]),
garbage_collect(),
io:format("M3: ~w~n",[timer:tc(?MODULE,checksum_3,[F,0])]),
garbage_collect(),
io:format("M4: ~w~n",[timer:tc(?MODULE,checksum_4,[F])]),
garbage_collect(),
io:format("M5: ~w~n",[timer:tc(?MODULE,c,[0,F,0])]),

garbage_collect(),
io:format("16_h: ~w~n",[timer:tc(?MODULE,cs_16,[F])]),
garbage_collect(),
io:format("32_h: ~w~n",[timer:tc(?MODULE,cs_32,[F])]),
garbage_collect(),
io:format("48_h: ~w~n",[timer:tc(?MODULE,cs_48,[F])]),
garbage_collect(),
io:format("64_h: ~w~n",[timer:tc(?MODULE,cs_64,[F])]),
%%
t_loop(1, 64, F).

t_loop(I, N, F) when I =< N ->
garbage_collect(),
io:format("~w_n: ~w~n",[I*16,timer:tc(?MODULE,cs_n,[F,I])]),
t_loop(I+1,N, F);
t_loop(_, _, _) ->
ok.

cs_16(Bin) ->
cs_16(0, Bin, 0).

cs_16(I, Bin, Csum) ->
case Bin of
<<_:I/binary, Num:16, _/binary>> -> cs_16(I+2, Bin, Csum+Num);
<<_:I/binary, Num:8>> -> cs_fold(Csum+(Num bsl 8));
<<_:I/binary>> -> cs_fold(Csum)
end.

cs_32(Bin) ->
cs_32(0, Bin, 0).

cs_32(I, Bin, Csum) ->
case Bin of
<<_:I/binary, Num:32, _/binary>> -> cs_32(I+4, Bin, Csum+Num);
<<_:I/binary, Num:24>> -> cs_fold(Csum+(Num bsl 8));
<<_:I/binary, Num:16>> -> cs_fold(Csum+(Num bsl 16));
<<_:I/binary, Num:8>> -> cs_fold(Csum+(Num bsl 24));
<<_:I/binary>> -> cs_fold(Csum)
end.

cs_48(Bin) ->
cs_48(0, Bin, 0).

cs_48(I, Bin, Csum) ->
case Bin of
<<_:I/binary, Num:48, _/binary>> -> cs_48(I+6, Bin, Csum+Num);
<<_:I/binary, Num:40>> -> cs_fold(Csum+(Num bsl 8));
<<_:I/binary, Num:32>> -> cs_fold(Csum+(Num bsl 16));
<<_:I/binary, Num:24>> -> cs_fold(Csum+(Num bsl 24));
<<_:I/binary, Num:16>> -> cs_fold(Csum+(Num bsl 32));
<<_:I/binary, Num:8>>  -> cs_fold(Csum+(Num bsl 40));
<<_:I/binary>> -> cs_fold(Csum)
end.

cs_64(Bin) ->
cs_64(0, Bin, 0).

cs_64(I, Bin, Csum) ->
case Bin of
<<_:I/binary, Num:64, _/binary>> -> cs_64(I+8, Bin, Csum+Num);
<<_:I/binary, Num:56>> -> cs_fold(Csum+(Num bsl 8));
<<_:I/binary, Num:48>> -> cs_fold(Csum+(Num bsl 16));
<<_:I/binary, Num:40>> -> cs_fold(Csum+(Num bsl 24));
<<_:I/binary, Num:32>> -> cs_fold(Csum+(Num bsl 32));
<<_:I/binary, Num:24>> -> cs_fold(Csum+(Num bsl 40));
<<_:I/binary, Num:16>> -> cs_fold(Csum+(Num bsl 48));
<<_:I/binary, Num:8>>  -> cs_fold(Csum+(Num bsl 56));
<<_:I/binary>> -> cs_fold(Csum)
end.

cs_128(Bin) ->
cs_n(Bin, 8).

cs_256(Bin) ->
cs_n(Bin, 16).

%% General loop
cs_n(Bin, N) ->
Bits = N*16,
cs_n(0, 2*N, Bits, Bin, 0).

cs_n(I, N, Bits, Bin, Csum) ->
case Bin of
<<_:I/binary, Num:Bits, _/binary>> -> cs_n(I+N,N,Bits,Bin,Csum+Num);
<<_:I/binary, Rest/binary>> ->
RestBits = size(Rest)*8,
<<Num:RestBits>> = Rest,
cs_fold(Csum + (Num bsl (Bits-RestBits)))
end.

cs_fold(Csum) when Csum > 16#ffff ->
cs_fold((Csum bsr 16) + (Csum band 16#ffff));
cs_fold(Csum) -> Csum.

checksum_1(<<Num:16/integer, Remainder/binary>>, Csum) ->
checksum_1(Remainder, Csum + Num);
checksum_1(<<Num:8/integer>>,CSum) ->
checksum_1(<<>>,CSum+(Num bsl 8));
checksum_1(_, Csum) when Csum > 16#ffff ->
checksum_1(<<>>,Csum band 16#ffff + (Csum bsr 16));
checksum_1(_,CSum) -> CSum.

checksum_2(Bin = <<N1:16/integer, N2:16/integer, N3:16/integer,
N4:16/integer, N5:16/integer, N6:16/integer,
N7:16/integer, N8:16/integer, Rem/binary>>,
Csum) when size(Bin)>=16 ->
checksum_2(Rem, Csum+N1+N2+N3+N4+N5+N6+N7+N8);
checksum_2(Bin,Csum) ->
checksum_1(Bin,Csum).

checksum_3(Bin = <<N1:16/integer, N2:16/integer, N3:16/integer,
N4:16/integer, N5:16/integer, N6:16/integer,
N7:16/integer, N8:16/integer, Rem/binary>>,
Csum) when size(Bin)>=16 ->
checksum_3(Rem,
'+'(Csum,'+'(N1,'+'(N2,'+'(N3,'+'(N4,'+'(N5,'+'(N6,'+'(N7,N8)))))))));
checksum_3(Bin,Csum) ->
c(0,Bin,Csum).

checksum_4(Bin0) ->
%% Pad the binary to even size if needed.
Bin = case size(Bin0) rem 2 of
0 -> Bin0;
1 -> <<Bin0/binary, 0>>
end,
Bits = size(Bin) * 8,
%% C is the "machine word" (arbitrary precision) checksum
<<C:Bits>> = Bin,
checksum_4(mul16(Bits, 16), C).

checksum_4(16, C) -> C;
checksum_4(B0, C0) when B0 > 16 ->
B1 = B0 div 2,
Mask = (1 bsl B1) -1,
High = C0 bsr B1,
C1 = High + Low,
checksum_4(B1, C1 band Mask + (C1 bsr B1)).

%% Return the smallest multiple of 16 that is greater than N.
%% (Where's the logarithm function?)
mul16(N, Acc) when Acc > N -> Acc;
mul16(N, Acc) -> mul16(N, Acc*16).

c(N,Bin,Csum) ->
case Bin of
<<_:N/binary, Num:16/integer,_/binary>> ->
c(N+2,Bin,'+'(Csum,Num));
<<_:N/binary, Num:8/integer>> ->
'+'(Csum,Num bsl 8);
_ -> Csum
end.

'+'(A,B) ->
S = A + B,
S band 16#ffff + (S bsr 16).