[erlang-questions] Re: data sharing is outside the semantics of Erlang, but it sure is useful
Paul Mineiro
paul-trapexit@REDACTED
Tue Sep 15 23:25:19 CEST 2009
i use erlang:term_to_binary to persist items in databases all the time, so
this thread caught my attention.
to stimulate discussion i pounded out some code that does hash consing
(hashcons:share/1) and that does encoding and decoding suitable for
composition with erlang:term_to_binary/2 (hashcons:encode/1) and
erlang:binary_to_term/1 (hashcons:decode/1). encoded terms don't
necessarily have a smaller wire footprint even in a very favorable case
(see below), but should decode into a shared data structure (as opposed to
calling hashcons:share/1 after calling erlang:binary_to_term/1, which
could create a temporary which is unacceptably huge).
attached.
-- p
% erl
Erlang (BEAM) emulator version 5.6.5 [source] [async-threads:0] [kernel-poll:false]
Eshell V5.6.5 (abort with ^G)
1> hashcons:encode (lists:duplicate (10, wazzup)).
{[0,0,0,0,0,0,0,0,0,0],[{0,wazzup}]}
2> size (erlang:term_to_binary (hashcons:encode (lists:duplicate (10, wazzup)), [ compressed ])).
35
3> size (erlang:term_to_binary (lists:duplicate (10, wazzup), [ compressed ])).
31
4> % not always smaller
4> size (erlang:term_to_binary (hashcons:encode (lists:duplicate (100, wazzup)), [ compressed ])).
38
5> size (erlang:term_to_binary (lists:duplicate (100, wazzup), [ compressed ])).
37
-------------- next part --------------
-module (hashcons).
-export ([ share/1,
encode/1,
decode/1 ]).
-include_lib ("eunit/include/eunit.hrl").
%-=====================================================================-
%- hash consing -
%-=====================================================================-
share (Term) ->
element (1, share (Term, dict:new ())).
share (Term, Dict) ->
case dict:find (Term, Dict) of
{ ok, OrigTerm } ->
{ OrigTerm, Dict };
error ->
{ NewTerm, NewDict } = share_copy (Term, Dict),
{ NewTerm, dict:store (NewTerm, NewTerm, NewDict) }
end.
share_copy (Term, Dict) when is_list (Term) ->
{ NewTerm, NewDict } =
lists:foldl (fun (T, { Acc, D }) ->
{ TPrime, DPrime } = share (T, D),
{ [ TPrime | Acc ], DPrime }
end,
{ [], Dict },
Term),
{ lists:reverse (NewTerm), NewDict };
share_copy (Term, Dict) when is_tuple (Term) ->
{ ListTerm, NewDict } = share_copy (tuple_to_list (Term), Dict),
{ list_to_tuple (ListTerm), NewDict };
share_copy (Term, Dict) ->
{ Term, Dict }.
%-=====================================================================-
%- encoding, intended to be composed with term_to_binary
%-=====================================================================-
encode (Term) ->
{ Ref, Encoded, _, Codec } = encode (Term, dict:new (), dict:new ()),
{ Encoded, [ X || X <- dict:to_list (Codec), element (1, X) =/= Ref ] }.
encode (Term, Dict, Codec) ->
case dict:find (Term, Dict) of
{ ok, { Ref, OrigTerm } } ->
{ Ref, OrigTerm, Dict, Codec };
error ->
{ NewTerm, NewDict, NewCodec } = encode_copy (Term, Dict, Codec),
Ref = dict:size (NewDict),
{ Ref,
NewTerm,
dict:store (NewTerm, { Ref, NewTerm }, NewDict),
dict:store (Ref, NewTerm, NewCodec)
}
end.
encode_copy (Term, Dict, Codec) when is_list (Term) ->
{ NewTerm, NewDict, NewCodec } =
lists:foldl (fun (T, { Acc, D, C }) ->
{ Ref, _, DPrime, CPrime } = encode (T, D, C),
{ [ Ref | Acc ], DPrime, CPrime }
end,
{ [], Dict, Codec },
Term),
{ lists:reverse (NewTerm), NewDict, NewCodec };
encode_copy (Term, Dict, Codec) when is_tuple (Term) ->
{ ListTerm, NewDict, NewCodec } =
encode_copy (tuple_to_list (Term), Dict, Codec),
{ { t, ListTerm }, NewDict, NewCodec };
encode_copy (Term, Dict, Codec) when is_integer (Term) ->
{ { i, Term }, Dict, Codec };
encode_copy (Term, Dict, Codec) ->
{ Term, Dict, Codec }.
%-=====================================================================-
%- decoding, intended to be composed with binary_to_term
%-=====================================================================-
decode ({ Encoded, Codec }) ->
decode (Encoded, dict:from_list (Codec)).
decode (Encoded, Codec) when is_list (Encoded) ->
[ decode (X, Codec) || X <- Encoded ];
decode ({ t, Encoded }, Codec) when is_list (Encoded) ->
list_to_tuple (decode (Encoded, Codec));
decode ({ i, Literal }, _Codec) when is_integer (Literal) ->
Literal;
decode (Ref, Codec) when is_integer (Ref) ->
decode (dict:fetch (Ref, Codec), Codec);
decode (Literal, _Codec) ->
Literal.
%-=====================================================================-
%- Tests -
%-=====================================================================-
% ok, don't want to depend upon quickcheck, so here's some cheese
-define (FORALL (Var, Gen, Cond), fun (A) -> Var = (Gen) (A), Cond end).
flasscheck (N, Limit, P) -> flasscheck (1, N, math:log (Limit), P).
flasscheck (M, N, LogLimit, P) when M =< N ->
Size = trunc (math:exp (LogLimit * M / N)),
true = P (Size),
io:format (".", []),
flasscheck (M + 1, N, LogLimit, P);
flasscheck (_, N, _, _) ->
io:format ("~n~p tests passed~n", [ N ]),
ok.
random_integer () ->
random:uniform (100000).
random_float () ->
random:uniform ().
random_binary () ->
list_to_binary ([ random:uniform (255) || _ <- lists:seq (1, 5) ]).
random_atom () ->
list_to_atom ([ random:uniform ($z - $a) + $a || _ <- lists:seq (1, 5) ]).
random_list () ->
[ random_term () || _ <- lists:seq (1, 3) ].
random_tuple () ->
list_to_tuple (random_list ()).
random_term () ->
case random:uniform (10) of
1 -> random_integer ();
2 -> random_integer ();
3 -> random_float ();
4 -> random_float ();
5 -> random_binary ();
6 -> random_binary ();
7 -> random_atom ();
8 -> random_atom ();
9 -> random_list ();
10 -> random_tuple ()
end.
share_test_ () ->
F = fun () ->
T = ?FORALL (X,
fun (_) -> random_term () end,
(fun (Term) ->
Term = share (Term),
true
end) (X)),
ok = flasscheck (10000, 10, T)
end,
{ timeout, 60, F }.
codec_test_ () ->
F = fun () ->
T = ?FORALL (X,
fun (_) -> random_term () end,
(fun (Term) ->
Term = decode (encode (Term)),
true
end) (X)),
ok = flasscheck (10000, 10, T)
end,
{ timeout, 60, F }.
More information about the erlang-questions
mailing list