String representation in erlang

Thinus Pollard thinus@REDACTED
Tue Sep 13 12:10:58 CEST 2005


Hi there

According to the Erlang efficiency guide a string is internally represented as 
a list of integers, thus consuming 2 words (8 bytes on a 32bit platform) of 
memory *per* character.

The attached code is an attempt at reducing the memory footprint of strings in 
erlang (passing between functions etc etc).

The basic idea is to pack a string into n byte sized integers and unpacking 
them on the other side. The text file called compare.txt also shows the 
memory needed to represent strings in normal erlang strings and this string 
packing.

Normal erlang strings are 2 words/character. The packed representation uses 1 
word of memory per list element plus n bytes/wordsize per integer element, 
where every integer element contain n characters.

Deficiencies:
If the string length is not divisible by n, space is wasted (the string gets 
padded with zeros). 

Usage:
Pick your the integer representation length.
packstring/1 takes a string returns a list of n byte integers
unpackstring/1 takes an integer representation and returns a string.

There is a simple test suite in test/0.

If anyone can improve upon this code, please do. If this was an exercise in 
futility, please let my know, I've only been programming erlang for 2 weeks 
and still need to learn all the gotchas ;)

-- 

Thinus Pollard

Mobile: +27 72 075 2751
-------------- next part --------------
Comparison of erlang strings vs packed strings. Left hand column is the
string length, second column is the bytes erlang uses to represent that
string, third - ninth column is the bytes needed to represent the packed
string. pack[n] refers to a packed string using n byte integers to store
the string.

Chars	erlang	pack4	pack8	pack12	pack16	pack20	pack24	pack32
	bytes	bytes	bytes	bytes	bytes	bytes	bytes	bytes
0	0	0	0	0	0	0	0	0
1	8	8	12	16	20	24	28	36
2	16	8	12	16	20	24	28	36
3	24	8	12	16	20	24	28	36
4	32	8	12	16	20	24	28	36
5	40	16	12	16	20	24	28	36
6	48	16	12	16	20	24	28	36
7	56	16	12	16	20	24	28	36
8	64	16	12	16	20	24	28	36
9	72	24	24	16	20	24	28	36
10	80	24	24	16	20	24	28	36
11	88	24	24	16	20	24	28	36
12	96	24	24	16	20	24	28	36
13	104	32	24	32	20	24	28	36
14	112	32	24	32	20	24	28	36
15	120	32	24	32	20	24	28	36
16	128	32	24	32	20	24	28	36
17	136	40	36	32	40	24	28	36
18	144	40	36	32	40	24	28	36
19	152	40	36	32	40	24	28	36
20	160	40	36	32	40	24	28	36
21	168	48	36	32	40	48	28	36
22	176	48	36	32	40	48	28	36
23	184	48	36	32	40	48	28	36
24	192	48	36	32	40	48	28	36
25	200	56	48	48	40	48	56	36
26	208	56	48	48	40	48	56	36
27	216	56	48	48	40	48	56	36
28	224	56	48	48	40	48	56	36
29	232	64	48	48	40	48	56	36
30	240	64	48	48	40	48	56	36
31	248	64	48	48	40	48	56	36
32	256	64	48	48	40	48	56	36
33	264	72	60	48	60	48	56	72
34	272	72	60	48	60	48	56	72
35	280	72	60	48	60	48	56	72
36	288	72	60	48	60	48	56	72
37	296	80	60	64	60	48	56	72
38	304	80	60	64	60	48	56	72
39	312	80	60	64	60	48	56	72
40	320	80	60	64	60	48	56	72
41	328	88	72	64	60	72	56	72
42	336	88	72	64	60	72	56	72
43	344	88	72	64	60	72	56	72
44	352	88	72	64	60	72	56	72
45	360	96	72	64	60	72	56	72
46	368	96	72	64	60	72	56	72
47	376	96	72	64	60	72	56	72
48	384	96	72	64	60	72	56	72
49	392	104	84	80	80	72	84	72
50	400	104	84	80	80	72	84	72
51	408	104	84	80	80	72	84	72
52	416	104	84	80	80	72	84	72
53	424	112	84	80	80	72	84	72
54	432	112	84	80	80	72	84	72
55	440	112	84	80	80	72	84	72
56	448	112	84	80	80	72	84	72
57	456	120	96	80	80	72	84	72
58	464	120	96	80	80	72	84	72
59	472	120	96	80	80	72	84	72
60	480	120	96	80	80	72	84	72
61	488	128	96	96	80	96	84	72
62	496	128	96	96	80	96	84	72
63	504	128	96	96	80	96	84	72
64	512	128	96	96	80	96	84	72
65	520	136	108	96	100	96	84	108
66	528	136	108	96	100	96	84	108
67	536	136	108	96	100	96	84	108
68	544	136	108	96	100	96	84	108
69	552	144	108	96	100	96	84	108
70	560	144	108	96	100	96	84	108
71	568	144	108	96	100	96	84	108
72	576	144	108	96	100	96	84	108
73	584	152	120	112	100	96	112	108
74	592	152	120	112	100	96	112	108
75	600	152	120	112	100	96	112	108
76	608	152	120	112	100	96	112	108
77	616	160	120	112	100	96	112	108
78	624	160	120	112	100	96	112	108
79	632	160	120	112	100	96	112	108
80	640	160	120	112	100	96	112	108
81	648	168	132	112	120	120	112	108
82	656	168	132	112	120	120	112	108
83	664	168	132	112	120	120	112	108
84	672	168	132	112	120	120	112	108
85	680	176	132	128	120	120	112	108
86	688	176	132	128	120	120	112	108
87	696	176	132	128	120	120	112	108
88	704	176	132	128	120	120	112	108
89	712	184	144	128	120	120	112	108
90	720	184	144	128	120	120	112	108
91	728	184	144	128	120	120	112	108
92	736	184	144	128	120	120	112	108
93	744	192	144	128	120	120	112	108
94	752	192	144	128	120	120	112	108
95	760	192	144	128	120	120	112	108
96	768	192	144	128	120	120	112	108
97	776	200	156	144	140	120	140	144
98	784	200	156	144	140	120	140	144
99	792	200	156	144	140	120	140	144
100	800	200	156	144	140	120	140	144
-------------- next part --------------
%%%-------------------------------------------------------------------
%%% File    : packer.erl
%%% Author  : Thinus Pollard <thinus@REDACTED>
%%% Description : Pack erlang strings (8 bytes/char according to the
%%%               erlang efficiency guide) into a list of integers
%%%               (1 byte / char).
%%%
%%% Created : 12 Sep 2005 by Thinus Pollard <thinus@REDACTED>
%%%-------------------------------------------------------------------
-module(packer).

%% define the size of the to use in bytes
-define(BYTES, 32).

-define(BITS, (?BYTES * 8)).

%% API
-export([packstring/1,unpackstring/1]).
-export([test/0]).

%%====================================================================
%% API
%%====================================================================
%%--------------------------------------------------------------------
%% Function: packstring/1
%% Description: Takes Erlang String and returns a list of integers of
%%              size BYTES containing this String.
%%--------------------------------------------------------------------
packstring(String) ->
    packstring(String, []).

%%--------------------------------------------------------------------
%% Function: unpackstring/1
%% Description: Takes List of BYTES sized integers and returns the
%%              repesented string.
%%--------------------------------------------------------------------
unpackstring(List) ->
    unpackstring(List, []).

%%--------------------------------------------------------------------
%% Function: test/0
%% Description: Takes List of BYTES sized integers and returns the
%%              repesented string.
%%--------------------------------------------------------------------
test() ->
    test("", "Empty string"),
    test("T", "Single character string"),
    test("This is a String to be packed", "'Normal' sized string containing spaces"),
    test("0123456789abcdefghijklmnopqrstuvwxyz", "'Normal' sized string without spaces"),
    test("000000000011111111112222222222333333333344444444445555555555"
         "6666666666777777777788888888889999999999aaaaaaaaaabbbbbbbbbb"
         "ccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhh"
         "iiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmmmmnnnnnnnnnn"
         "ooooooooooppppppppppqqqqqqqqqqrrrrrrrrrrsssssssssstttttttttt"
         "uuuuuuuuuuvvvvvvvvvvwwwwwwwwwwxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz", "Longish string without spaces").
    
%%====================================================================
%% Internal functions
%%====================================================================

%%--------------------------------------------------------------------
%% Function: packstring/2
%% Description: Takes Erlang String and returns a list of integers of
%%              size BYTES containing this String.
%%--------------------------------------------------------------------
packstring([], Res)->
    lists:reverse(Res);
packstring(String, Res) ->
    case string:len(String) > ?BYTES - 1 of
	 true -> %% at least BYTES characters left
	    Working = string:substr(String, 1, ?BYTES),
	    WC = list_to_binary(Working),
	    <<WB:?BITS>> = WC,
	    packstring(string:substr(String, ?BYTES + 1), [WB|Res]);
	false -> %% we need to zero pad the remaining string to BYTES characters
	    String2 = lists:append(String, lists:duplicate(?BYTES - string:len(String), 0)),
	    packstring(String2, Res)
    end.

%%--------------------------------------------------------------------
%% Function: unpackstring/2
%% Description: Takes List of BYTES sized integers and returns the
%%              repesented string.
%%--------------------------------------------------------------------
unpackstring([], Res) ->
    %% drop the padded zeros (if any)
    Fun1 = fun (X) ->
		   X /= 0
	   end,
    Res1 = lists:filter(Fun1, Res),
    lists:reverse(Res1);
unpackstring([H|T], Res) ->
    %% take integers 1 by 1 and decode
    R = buildBin(H, ?BITS, []),
    unpackstring(T, R ++ Res).

%%--------------------------------------------------------------------
%% Function: buildBin/3
%% Description: Takes a binary, number of bits representing that binary
%%              and a result list. Returns a list containing the binary
%%              broken into 8bit chunks
%%--------------------------------------------------------------------
buildBin(_Bin, 0, Res) ->
    lists:reverse(Res);
buildBin(Bin, Bits, Res) ->
    Bits2 = Bits - 8,
    <<A:8, B:Bits2>> = <<Bin:Bits>>,
    Res2 = Res ++ binary_to_list(<<A>>),
    buildBin(B, Bits - 8, Res2).

%%--------------------------------------------------------------------
%% Function: test/2
%% Description: Test suite: encodes a string, decodes it and compares the
%%              original string with the decoded string.
%%--------------------------------------------------------------------
test(String, Desc) ->
    R = packstring(String),
    Size = (length(R) * 4) + (?BYTES * length(R)),
    StringL = string:len(String),
    S = unpackstring(R, []),
    error_logger:info_msg("Test string description: ~p~n"
			  "Original string: ~p~n"
			  "Packing string into list of ~p byte sized integers~n"
			  "Packed string: ~p~n"
			  "Unpacked string: ~p~n"
			  "Strings match: ~p~n"
			  "-----~n"
			  "Stats~n"
			  "-----~n"
			  "String length: ~p~n"
			  "Size of erlang string (bytes): ~p~n"
			  "Size of packed string (bytes): ~p~n", [Desc, String, ?BYTES, R, S, String == S, StringL, StringL * 8, Size]).


More information about the erlang-questions mailing list