[erlang-questions] Re: data sharing is outside the semantics of Erlang, but it sure is useful

Paul Mineiro paul-trapexit@REDACTED
Tue Sep 15 23:25:19 CEST 2009


i use erlang:term_to_binary to persist items in databases all the time, so
this thread caught my attention.

to stimulate discussion i pounded out some code that does hash consing
(hashcons:share/1) and that does encoding and decoding suitable for
composition with erlang:term_to_binary/2 (hashcons:encode/1) and
erlang:binary_to_term/1 (hashcons:decode/1).  encoded terms don't
necessarily have a smaller wire footprint even in a very favorable case
(see below), but should decode into a shared data structure (as opposed to
calling hashcons:share/1 after calling erlang:binary_to_term/1, which
could create a temporary which is unacceptably huge).

attached.

-- p

% erl
Erlang (BEAM) emulator version 5.6.5 [source] [async-threads:0] [kernel-poll:false]

Eshell V5.6.5  (abort with ^G)
1> hashcons:encode (lists:duplicate (10, wazzup)).
{[0,0,0,0,0,0,0,0,0,0],[{0,wazzup}]}
2> size (erlang:term_to_binary (hashcons:encode (lists:duplicate (10, wazzup)), [ compressed ])).
35
3> size (erlang:term_to_binary (lists:duplicate (10, wazzup), [ compressed ])).
31
4> % not always smaller
4> size (erlang:term_to_binary (hashcons:encode (lists:duplicate (100, wazzup)), [ compressed ])).
38
5> size (erlang:term_to_binary (lists:duplicate (100, wazzup), [ compressed ])).
37
-------------- next part --------------
-module (hashcons).
-export ([ share/1,
           encode/1,
           decode/1 ]).

-include_lib ("eunit/include/eunit.hrl").

%-=====================================================================-
%-                             hash consing                            -
%-=====================================================================-

share (Term) ->
  element (1, share (Term, dict:new ())).

share (Term, Dict) ->
  case dict:find (Term, Dict) of
    { ok, OrigTerm } ->
      { OrigTerm, Dict };
    error ->
      { NewTerm, NewDict } = share_copy (Term, Dict),
      { NewTerm, dict:store (NewTerm, NewTerm, NewDict) }
  end.

share_copy (Term, Dict) when is_list (Term) ->
  { NewTerm, NewDict } = 
    lists:foldl (fun (T, { Acc, D }) -> 
                  { TPrime, DPrime } = share (T, D),
                  { [ TPrime | Acc ], DPrime }
                 end,
                 { [], Dict },
                 Term),
  { lists:reverse (NewTerm), NewDict };
share_copy (Term, Dict) when is_tuple (Term) ->
  { ListTerm, NewDict } = share_copy (tuple_to_list (Term), Dict),
  { list_to_tuple (ListTerm), NewDict };
share_copy (Term, Dict) ->
  { Term, Dict }.

%-=====================================================================-
%-      encoding, intended to be composed with term_to_binary
%-=====================================================================-

encode (Term) ->
  { Ref, Encoded, _, Codec } = encode (Term, dict:new (), dict:new ()),
  { Encoded, [ X || X <- dict:to_list (Codec), element (1, X) =/= Ref ] }.

encode (Term, Dict, Codec) ->
  case dict:find (Term, Dict) of
    { ok, { Ref, OrigTerm } } ->
      { Ref, OrigTerm, Dict, Codec };
    error ->
      { NewTerm, NewDict, NewCodec } = encode_copy (Term, Dict, Codec),
      Ref = dict:size (NewDict),
      { Ref, 
        NewTerm, 
        dict:store (NewTerm, { Ref, NewTerm }, NewDict),
        dict:store (Ref, NewTerm, NewCodec)
      }
  end.

encode_copy (Term, Dict, Codec) when is_list (Term) ->
  { NewTerm, NewDict, NewCodec } = 
    lists:foldl (fun (T, { Acc, D, C }) -> 
                  { Ref, _, DPrime, CPrime } = encode (T, D, C),
                  { [ Ref | Acc ], DPrime, CPrime }
                 end,
                 { [], Dict, Codec },
                 Term),
  { lists:reverse (NewTerm), NewDict, NewCodec };
encode_copy (Term, Dict, Codec) when is_tuple (Term) ->
  { ListTerm, NewDict, NewCodec } = 
    encode_copy (tuple_to_list (Term), Dict, Codec),
  { { t, ListTerm }, NewDict, NewCodec };
encode_copy (Term, Dict, Codec) when is_integer (Term) ->
  { { i, Term }, Dict, Codec };
encode_copy (Term, Dict, Codec) ->
  { Term, Dict, Codec }.

%-=====================================================================-
%-      decoding, intended to be composed with binary_to_term
%-=====================================================================-

decode ({ Encoded, Codec }) ->
  decode (Encoded, dict:from_list (Codec)).

decode (Encoded, Codec) when is_list (Encoded) ->
  [ decode (X, Codec) || X <- Encoded ];
decode ({ t, Encoded }, Codec) when is_list (Encoded) ->
  list_to_tuple (decode (Encoded, Codec));
decode ({ i, Literal }, _Codec) when is_integer (Literal) ->
  Literal;
decode (Ref, Codec) when is_integer (Ref) ->
  decode (dict:fetch (Ref, Codec), Codec);
decode (Literal, _Codec) ->
  Literal.

%-=====================================================================-
%-                                Tests                                -
%-=====================================================================-

% ok, don't want to depend upon quickcheck, so here's some cheese

-define (FORALL (Var, Gen, Cond), fun (A) -> Var = (Gen) (A), Cond end).

flasscheck (N, Limit, P) -> flasscheck (1, N, math:log (Limit), P).

flasscheck (M, N, LogLimit, P) when M =< N -> 
  Size = trunc (math:exp (LogLimit * M / N)),
  true = P (Size),
  io:format (".", []),
  flasscheck (M + 1, N, LogLimit, P);
flasscheck (_, N, _, _) -> 
  io:format ("~n~p tests passed~n", [ N ]),
  ok.

random_integer () ->
  random:uniform (100000).

random_float () ->
  random:uniform ().

random_binary () ->
  list_to_binary ([ random:uniform (255) || _ <- lists:seq (1, 5) ]).

random_atom () ->
  list_to_atom ([ random:uniform ($z - $a) + $a || _ <- lists:seq (1, 5) ]).

random_list () ->
  [ random_term () || _ <- lists:seq (1, 3) ].

random_tuple () ->
  list_to_tuple (random_list ()).

random_term () ->
  case random:uniform (10) of
    1 -> random_integer ();
    2 -> random_integer ();
    3 -> random_float ();
    4 -> random_float ();
    5 -> random_binary ();
    6 -> random_binary ();
    7 -> random_atom ();
    8 -> random_atom ();
    9 -> random_list ();
    10 -> random_tuple ()
  end.

share_test_ () ->
  F = fun () ->
    T = ?FORALL (X,
                 fun (_) -> random_term () end,
                 (fun (Term) ->
                    Term = share (Term),
                    true
                  end) (X)),

    ok = flasscheck (10000, 10, T)
  end,

  { timeout, 60, F }.

codec_test_ () ->
  F = fun () ->
    T = ?FORALL (X,
                 fun (_) -> random_term () end,
                 (fun (Term) ->
                    Term = decode (encode (Term)),
                    true
                  end) (X)),

    ok = flasscheck (10000, 10, T)
  end,

  { timeout, 60, F }.


More information about the erlang-questions mailing list