[erlang-questions] Extending term external format to support shared substructures
Matthew Dempsky
matthew@REDACTED
Tue Mar 31 06:37:08 CEST 2009
On Mon, Mar 30, 2009 at 6:19 PM, Matthew Dempsky <matthew@REDACTED> wrote:
> Unless anyone is strongly opposed to the idea, I'll work on a
> proof-of-concept patch.
Applying the patch below to R13A extends binary_to_term to support the
'D' and 'w' type tags as I described above. For example:
1> binary_to_term(<<131,$D,3:32, $k,5:16,"hello", $k,5:16,"world",
$h,2,$w,0:32,$w,1:32, $l,4:32,$w,2:32,$w,2:32,$w,2:32,$w,2:32,$j>>).
[{"hello","world"},
{"hello","world"},
{"hello","world"},
{"hello","world"}]
(This example uses a three element dictionary: the strings "hello" and
"world" are the first two words, the tuple {"hello", "world"} is the
third, using references to the dictionary instances; finally, the term
value is a length-4 list using 4 references to the dictionary tuple.)
The patch isn't entirely minimal; it also fixes a decoding problem for
zero length LIST_EXT structures, avoids allocating extra heap cells
for list structures, and refactors some of the integer unpacking to
use get_int16 and get_int32.
On a related note, I don't really understand why decoded_size keeps a
stack of values for 'terms'. It seems like it should just be possible
to keep a running grand-total rather than pushing and popping from a
stack.
--- erts/emulator/beam/external.h.orig 2009-03-30 19:35:43.000000000 -0700
+++ erts/emulator/beam/external.h 2009-03-30 19:37:30.000000000 -0700
@@ -50,6 +50,9 @@
#define NEW_CACHE 'N'
#define CACHED_ATOM 'C'
+#define DICTIONARY 'D'
+#define WORD_EXT 'w'
+
#define COMPRESSED 'P'
#define VERSION_MAGIC 131 /* 130 in erlang 4.2 */
--- erts/emulator/beam/external.c.orig 2009-03-30 19:35:39.000000000 -0700
+++ erts/emulator/beam/external.c 2009-03-30 21:26:33.000000000 -0700
@@ -76,6 +76,7 @@
static byte* enc_atom(DistEntry*, Eterm, byte*);
static byte* enc_pid(DistEntry*, Eterm, byte*);
static byte* dec_term(DistEntry*, Eterm**, byte*, ErlOffHeap*, Eterm*);
+static byte* dec_term1(DistEntry*, Eterm**, byte*, ErlOffHeap*,
Eterm*, Eterm*, Uint32);
static byte* dec_atom(DistEntry*, byte*, Eterm*);
static byte* dec_pid(DistEntry*, Eterm**, byte*, ErlOffHeap*, Eterm*);
static byte* dec_hashed_atom(DistEntry*, byte*, Eterm*);
@@ -988,6 +989,32 @@
static byte*
dec_term(DistEntry *dep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap,
Eterm* objp)
{
+ Uint32 i, n;
+ Eterm *dict;
+
+ if (*ep != DICTIONARY) {
+ return dec_term1(dep, hpp, ep, off_heap, objp, NULL, 0);
+ }
+
+ ++ep;
+ n = get_int32(ep);
+ ep += 4;
+ dict = erts_alloc(ERTS_ALC_T_TMP, sizeof(*dict) * n);
+ for (i = 0; i < n; ++i) {
+ if ((ep = dec_term1(dep, hpp, ep, off_heap, &dict[i], dict, i)) == NULL) {
+ break;
+ }
+ }
+ if (ep != NULL) {
+ ep = dec_term1(dep, hpp, ep, off_heap, objp, dict, n);
+ }
+ erts_free(ERTS_ALC_T_TMP, dict);
+ return ep;
+}
+
+static byte*
+dec_term1(DistEntry *dep, Eterm** hpp, byte* ep, ErlOffHeap*
off_heap, Eterm* objp, Eterm *dict, Uint32 dictsize)
+{
int n;
register Eterm* hp = *hpp; /* Please don't take the address of hp */
Eterm* next = objp;
@@ -1099,7 +1126,7 @@
n = get_int32(ep);
ep += 4;
if (n == 0) {
- *objp = NIL;
+ next = objp;
break;
}
*objp = make_list(hp);
@@ -1407,7 +1434,7 @@
goto error;
}
*hpp = hp;
- ep = dec_term(dep, hpp, ep, off_heap, &temp);
+ ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize);
hp = *hpp;
if (ep == NULL) {
return NULL;
@@ -1467,7 +1494,7 @@
module = temp;
/* Index */
- if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -1476,7 +1503,7 @@
old_index = unsigned_val(temp);
/* Uniq */
- if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -1559,7 +1586,7 @@
module = temp;
/* Index */
- if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -1568,7 +1595,7 @@
old_index = unsigned_val(temp);
/* Uniq */
- if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -1600,6 +1627,14 @@
}
break;
}
+ case WORD_EXT:
+ n = get_int32(ep);
+ ep += 4;
+ if (n < 0 || n >= dictsize) {
+ goto error;
+ }
+ *objp = dict[n];
+ break;
default:
error:
/*
@@ -1770,6 +1805,7 @@
int heap_size = 0;
int terms = 1;
int atom_extra_skip = 0;
+ int first = 1;
Uint n;
DECLARE_ESTACK(s);
@@ -1815,13 +1851,13 @@
break;
case LARGE_BIG_EXT:
CHKSIZE(4);
- n = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+ n = get_int32(ep);
SKIP2(n,4+1); /* skip, size,sign,digits */
heap_size += 1+1+(n+sizeof(Eterm)-1)/sizeof(Eterm); /* XXX: 1 too much? */
break;
case ATOM_EXT:
CHKSIZE(2);
- n = (ep[0] << 8) | ep[1];
+ n = get_int16(ep);
if (n > MAX_ATOM_LENGTH) {
return -1;
}
@@ -1885,10 +1921,10 @@
case LIST_EXT:
CHKSIZE(4);
ESTACK_PUSH(s, terms);
- terms = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
- terms++;
+ terms = get_int32(ep);
ep += 4;
heap_size += 2 * terms;
+ terms++;
break;
case SMALL_TUPLE_EXT:
CHKSIZE(1);
@@ -1899,13 +1935,13 @@
case LARGE_TUPLE_EXT:
CHKSIZE(4);
ESTACK_PUSH(s, terms);
- terms = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+ terms = get_int32(ep);
ep += 4;
heap_size += terms + 1;
break;
case STRING_EXT:
CHKSIZE(2);
- n = (ep[0] << 8) | ep[1];
+ n = get_int16(ep);
SKIP(n+2);
heap_size += 2 * n;
break;
@@ -1919,7 +1955,7 @@
break;
case BINARY_EXT:
CHKSIZE(4);
- n = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+ n = get_int32(ep);
SKIP2(n, 4);
if (n <= ERL_ONHEAP_BIN_LIMIT || no_refc_bins) {
heap_size += heap_bin_size(n);
@@ -1930,7 +1966,7 @@
case BIT_BINARY_EXT:
{
CHKSIZE(5);
- n = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+ n = get_int32(ep);
SKIP2(n, 5);
if (n <= ERL_ONHEAP_BIN_LIMIT || no_refc_bins) {
heap_size += heap_bin_size(n) + ERL_SUB_BIN_SIZE;
@@ -1965,9 +2001,23 @@
heap_size += ERL_FUN_SIZE + num_free;
break;
}
+ case DICTIONARY:
+ if (!first) {
+ return -1;
+ }
+ CHKSIZE(4);
+ ESTACK_PUSH(s, terms);
+ terms = get_int32(ep) + 1;
+ ep += 4;
+ break;
+ case WORD_EXT:
+ SKIP(4);
+ break;
default:
return -1;
}
+
+ first = 0;
}
if (ESTACK_ISEMPTY(s)) {
DESTROY_ESTACK(s);
More information about the erlang-questions
mailing list