[erlang-questions] Extending term external format to support shared substructures

Matthew Dempsky matthew@REDACTED
Tue Mar 31 06:37:08 CEST 2009


On Mon, Mar 30, 2009 at 6:19 PM, Matthew Dempsky <matthew@REDACTED> wrote:
> Unless anyone is strongly opposed to the idea, I'll work on a
> proof-of-concept patch.

Applying the patch below to R13A extends binary_to_term to support the
'D' and 'w' type tags as I described above.  For example:

1> binary_to_term(<<131,$D,3:32, $k,5:16,"hello", $k,5:16,"world",
$h,2,$w,0:32,$w,1:32, $l,4:32,$w,2:32,$w,2:32,$w,2:32,$w,2:32,$j>>).
[{"hello","world"},
 {"hello","world"},
 {"hello","world"},
 {"hello","world"}]

(This example uses a three element dictionary: the strings "hello" and
"world" are the first two words, the tuple {"hello", "world"} is the
third, using references to the dictionary instances; finally, the term
value is a length-4 list using 4 references to the dictionary tuple.)

The patch isn't entirely minimal; it also fixes a decoding problem for
zero length LIST_EXT structures, avoids allocating extra heap cells
for list structures, and refactors some of the integer unpacking to
use get_int16 and get_int32.

On a related note, I don't really understand why decoded_size keeps a
stack of values for 'terms'.  It seems like it should just be possible
to keep a running grand-total rather than pushing and popping from a
stack.

--- erts/emulator/beam/external.h.orig	2009-03-30 19:35:43.000000000 -0700
+++ erts/emulator/beam/external.h	2009-03-30 19:37:30.000000000 -0700
@@ -50,6 +50,9 @@
 #define NEW_CACHE         'N'
 #define CACHED_ATOM       'C'

+#define DICTIONARY        'D'
+#define WORD_EXT          'w'
+
 #define COMPRESSED        'P'

 #define VERSION_MAGIC 131   /* 130 in erlang 4.2 */
--- erts/emulator/beam/external.c.orig	2009-03-30 19:35:39.000000000 -0700
+++ erts/emulator/beam/external.c	2009-03-30 21:26:33.000000000 -0700
@@ -76,6 +76,7 @@
 static byte* enc_atom(DistEntry*, Eterm, byte*);
 static byte* enc_pid(DistEntry*, Eterm, byte*);
 static byte* dec_term(DistEntry*, Eterm**, byte*, ErlOffHeap*, Eterm*);
+static byte* dec_term1(DistEntry*, Eterm**, byte*, ErlOffHeap*,
Eterm*, Eterm*, Uint32);
 static byte* dec_atom(DistEntry*, byte*, Eterm*);
 static byte* dec_pid(DistEntry*, Eterm**, byte*, ErlOffHeap*, Eterm*);
 static byte* dec_hashed_atom(DistEntry*, byte*, Eterm*);
@@ -988,6 +989,32 @@
 static byte*
 dec_term(DistEntry *dep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap,
Eterm* objp)
 {
+    Uint32 i, n;
+    Eterm *dict;
+
+    if (*ep != DICTIONARY) {
+	return dec_term1(dep, hpp, ep, off_heap, objp, NULL, 0);
+    }
+
+    ++ep;
+    n = get_int32(ep);
+    ep += 4;
+    dict = erts_alloc(ERTS_ALC_T_TMP, sizeof(*dict) * n);
+    for (i = 0; i < n; ++i) {
+	if ((ep = dec_term1(dep, hpp, ep, off_heap, &dict[i], dict, i)) == NULL) {
+	    break;
+	}
+    }
+    if (ep != NULL) {
+	ep = dec_term1(dep, hpp, ep, off_heap, objp, dict, n);
+    }
+    erts_free(ERTS_ALC_T_TMP, dict);
+    return ep;
+}
+
+static byte*
+dec_term1(DistEntry *dep, Eterm** hpp, byte* ep, ErlOffHeap*
off_heap, Eterm* objp, Eterm *dict, Uint32 dictsize)
+{
     int n;
     register Eterm* hp = *hpp;	/* Please don't take the address of hp */
     Eterm* next = objp;
@@ -1099,7 +1126,7 @@
 	    n = get_int32(ep);
 	    ep += 4;
 	    if (n == 0) {
-		*objp = NIL;
+		next = objp;
 		break;
 	    }
 	    *objp = make_list(hp);
@@ -1407,7 +1434,7 @@
 		    goto error;
 		}
 		*hpp = hp;
-		ep = dec_term(dep, hpp, ep, off_heap, &temp);
+		ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize);
 		hp = *hpp;
 		if (ep == NULL) {
 		    return NULL;
@@ -1467,7 +1494,7 @@
 		module = temp;

 		/* Index */
-		if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -1476,7 +1503,7 @@
 		old_index = unsigned_val(temp);

 		/* Uniq */
-		if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -1559,7 +1586,7 @@
 		module = temp;

 		/* Index */
-		if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -1568,7 +1595,7 @@
 		old_index = unsigned_val(temp);

 		/* Uniq */
-		if ((ep = dec_term(dep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term1(dep, hpp, ep, off_heap, &temp, dict, dictsize))
== NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -1600,6 +1627,14 @@
 		}
 		break;
 	    }
+	case WORD_EXT:
+	    n = get_int32(ep);
+	    ep += 4;
+	    if (n < 0 || n >= dictsize) {
+		goto error;
+	    }
+	    *objp = dict[n];
+	    break;
 	default:
 	error:
 	    /*
@@ -1770,6 +1805,7 @@
     int heap_size = 0;
     int terms = 1;
     int atom_extra_skip = 0;
+    int first = 1;
     Uint n;
     DECLARE_ESTACK(s);

@@ -1815,13 +1851,13 @@
 		break;
 	    case LARGE_BIG_EXT:
 		CHKSIZE(4);
-		n = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+		n = get_int32(ep);
 		SKIP2(n,4+1);		/* skip, size,sign,digits */
 		heap_size += 1+1+(n+sizeof(Eterm)-1)/sizeof(Eterm); /* XXX: 1 too much? */
 		break;
 	    case ATOM_EXT:
 		CHKSIZE(2);
-		n = (ep[0] << 8) | ep[1];
+		n = get_int16(ep);
 		if (n > MAX_ATOM_LENGTH) {
 		    return -1;
 		}
@@ -1885,10 +1921,10 @@
 	    case LIST_EXT:
 		CHKSIZE(4);
 		ESTACK_PUSH(s, terms);
-		terms = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
-		terms++;
+		terms = get_int32(ep);
 		ep += 4;
 		heap_size += 2 * terms;
+		terms++;
 		break;
 	    case SMALL_TUPLE_EXT:
 		CHKSIZE(1);
@@ -1899,13 +1935,13 @@
 	    case LARGE_TUPLE_EXT:
 		CHKSIZE(4);
 		ESTACK_PUSH(s, terms);
-		terms = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+		terms = get_int32(ep);
 		ep += 4;
 		heap_size += terms + 1;
 		break;
 	    case STRING_EXT:
 		CHKSIZE(2);
-		n = (ep[0] << 8) | ep[1];
+		n = get_int16(ep);
 		SKIP(n+2);
 		heap_size += 2 * n;
 		break;
@@ -1919,7 +1955,7 @@
 		break;
 	    case BINARY_EXT:
 		CHKSIZE(4);
-		n = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+		n = get_int32(ep);
 		SKIP2(n, 4);
 		if (n <= ERL_ONHEAP_BIN_LIMIT || no_refc_bins) {
 		    heap_size += heap_bin_size(n);
@@ -1930,7 +1966,7 @@
 	    case BIT_BINARY_EXT:
 		{
 		    CHKSIZE(5);
-		    n = (ep[0] << 24) | (ep[1] << 16) | (ep[2] << 8) | ep[3];
+		    n = get_int32(ep);
 		    SKIP2(n, 5);
 		    if (n <= ERL_ONHEAP_BIN_LIMIT || no_refc_bins) {
 			heap_size += heap_bin_size(n) + ERL_SUB_BIN_SIZE;
@@ -1965,9 +2001,23 @@
 		    heap_size += ERL_FUN_SIZE + num_free;
 		    break;
 		}
+	    case DICTIONARY:
+		if (!first) {
+		    return -1;
+		}
+		CHKSIZE(4);
+		ESTACK_PUSH(s, terms);
+		terms = get_int32(ep) + 1;
+		ep += 4;
+		break;
+	    case WORD_EXT:
+		SKIP(4);
+		break;
 	    default:
 		return -1;
 	    }
+
+            first = 0;
 	}
 	if (ESTACK_ISEMPTY(s)) {
 	    DESTROY_ESTACK(s);



More information about the erlang-questions mailing list