[erlang-questions] A bug? - bloom filters - and loadable BIFs

Joe Armstrong erlang@REDACTED
Wed Apr 29 09:28:02 CEST 2009


Hi,

I recent got mail from Steve Kirsch (Hello Steve) - asking a number
of questions about representation of bit vectors for writing Bloom filters.

Having read my book he thought I could answer all questions about Erlang
little did he know that the collective intelligence of the people reading this
list far exceeds what I could contribute :-)

So he sent me a program, which crashes his machine (out of memory)
which he thought
should work - to my eye it looks ok - It will not be blindingly fast,
but it looks properly
tail-recursive so the garbage collector should not have any problems.

I've appended at the end of this mail - so you can see for yourself.
It is a GC bug
or a user error?

The next question is "what is a suitable data structure for
representing a large bit vector"
(we want to flip specific bits) - the "pure" answer I guess is a tuple
of tuples of binaries
(say 32x32xB) where B is a 128 byte binary for 2^20 bits.

The "tuple of tuple of tuple ..." representation is commonly used to
implement array and dicts

Possibly we could just use array.erl creating an array of 32x32
elements containing 128 byte binaries.

If this is not efficient enough we would turn to C.

A linked-in driver seems overkill here - we can't send a
large binary back and forth over an I/O interface - so this needs a BIF.

What I'd like is a "linked in BIF" - it seems to me to be a bit of a
pain in the whatnot
to have to mess around with the Erlang internals every time you want
to write a new BIF.

Much nicer to have a linked-in BIF - on demand loading of C code is pretty easy
so this should not be a problem. One could also impose a measure of
stack discipline
for linked-in BIFs (as in the JVM) - this would make it pretty easy to
write extensions.

Nuff said ...

/Joe


Now the origonal post that sparked this thread ..

------ cut --- below this line is from Steve's post-----

...

i tested the erlang bloom filter I found in google code which i've
included below.

Doing 10,000 inserts per my test routine (which allocates a 1M key
bitmap to start to make it "tough" for erlang) took 423 cpu seconds,
which is only 23 inserts/sec!?!?! That is ludicrous as I'm sure you'd
agree

And if you tried only 100,000 inserts into my 1.8MB table, erlang
crashes:
%% binary_alloc: Cannot allocate 3594429 bytes of memory (of type
"binary").

The book is pretty much silent on how you'd implement this important
class of functions.

Nor does it talk about why this code fails, though I'd imagine it is
copying and gc'ing large binaries like crazy every time you change 1
bit.

I think the book should talk about is there a way to write this code
with binaries and work? Or do you have to use arrays or some other
means?



%% @doc Implementation of the Bloom filter data structure.
%% @reference [http://en.wikipedia.org/wiki/Bloom_filter]

-module(bitmap).
-export([test/1, new/1, new/2, is_bloom/1, is_element/2,
add_element/2]).
-import(math, [log/1, pow/2]).
-import(erlang, [phash2/2]).


% B0 = bloom:new(2000, 0.001).
% bloom:is_bloom(B0).
% B1 = bloom:add_element(Key, B0).
% bloom:is_element(Key, B1).


-record(bloom, {
   m      = 0,       % The size of the bitmap in bits.
   bitmap = <<>>,    % The bitmap.
   k      = 0,       % The number of hashes.
   n      = 0,       % The maximum number of keys.
   keys   = 0        % The current number of keys.
}).

%% @spec new(capacity) -> bloom()
%% @equiv new(capacity, 0.001)
new(N) -> new(N, 0.001).

%% @spec new(integer(), float()) -> bloom()
%% @doc Creates a new Bloom filter, given a maximum number of keys and a
%%     false-positive error rate.
new(N, E) when N > 0, is_float(E), E > 0, E =< 1 ->
   {M, K} = calc_least_bits(N, E),
   #bloom{m=M, bitmap = <<0:((M+7) div 8 * 8)>>, k=K, n=N}.

%% @spec is_bloom(bloom()) -> bool()
%% @doc Determines if the given argument is a bloom record.
is_bloom(#bloom{}) -> true;
is_bloom(_) -> false.

%% @spec is_element(string(), bloom()) -> bool()
%% @doc Determines if the key is (probably) an element of the filter.
is_element(Key, B) -> is_element(Key, B, calc_idxs(Key, B)).
is_element(_, _, []) -> true;
is_element(Key, B, [Idx | T]) ->
   ByteIdx = Idx div 8,
   <<_:ByteIdx/binary, Byte:8, _/binary>> = B#bloom.bitmap,
   Mask = 1 bsl (Idx rem 8),
   case 0 =/= Byte band Mask of
        true -> is_element(Key, B, T);
       false -> false
   end.

%% @spec add_element(string(), bloom()) -> bloom()
%% @doc Adds the key to the filter.
add_element(Key, #bloom{keys=Keys, n=N, bitmap=Bitmap} = B) when Keys <
N ->
   Idxs = calc_idxs(Key, B),
   Bitmap0 = set_bits(Bitmap, Idxs),
   case Bitmap0 == Bitmap of
        true -> B;    % Don't increment key count for duplicates.
       false -> B#bloom{bitmap=Bitmap0, keys=Keys+1}
   end.

set_bits(Bin, []) -> Bin;
set_bits(Bin, [Idx | Idxs]) ->
   ByteIdx = Idx div 8,
   <<Pre:ByteIdx/binary, Byte:8, Post/binary>> = Bin,
   Mask = 1 bsl (Idx rem 8),
   Byte0 = Byte bor Mask,
   set_bits(<<Pre/binary, Byte0:8, Post/binary>>, Idxs).

% Find the optimal bitmap size and number of hashes.
calc_least_bits(N, E) -> calc_least_bits(N, E, 1, 0, 0).
calc_least_bits(N, E, K, MinM, BestK) ->
   M = -1 * K * N / log(1 - pow(E, 1/K)),
   {CurM, CurK} = if M < MinM -> {M, K}; true -> {MinM, BestK} end,
   case K of
         1 -> calc_least_bits(N, E, K+1, M, K);
       100 -> {trunc(CurM)+1, CurK};
         _ -> calc_least_bits(N, E, K+1, CurM, CurK)
   end.

% This uses the "enhanced double hashing" algorithm.
% Todo: handle case of m > 2^32.
calc_idxs(Key, #bloom{m=M, k=K}) ->
   X = phash2(Key, M),
   Y = phash2({"salt", Key}, M),
   calc_idxs(M, K - 1, X, Y, [X]).
calc_idxs(_, 0, _, _, Acc) -> Acc;
calc_idxs(M, I, X, Y, Acc) ->
   Xi = (X+Y) rem M,
   Yi = (Y+I) rem M,
   calc_idxs(M, I-1, Xi, Yi, [Xi | Acc]).

test(N)->
   B0 = new(1000000, 0.001),
   F=fun(Key, Bloom)-> add_element(Key, Bloom) end,
   lists:foldl(F, B0, lists:seq(1,N)).

%% to test: B1=bitmap:test(1000000).
%% which crashes erlang even if only 100,000 entries


%% Crash dump was written to: erl_crash.dump
%% binary_alloc: Cannot allocate 3594429 bytes of memory (of type
"binary").

%% This application has requested the Runtime to terminate it in an
unusual way.
%% Please contact the application's support team for more information.

%% Process inferior-erlang exited abnormally with code 3



More information about the erlang-questions mailing list