[erlang-bugs] Erlang vm beam.smp crash

Sverker Eriksson sverker.eriksson@REDACTED
Wed Oct 1 15:19:09 CEST 2014


Thank you, 刘小飞

This is a race bug that has been there since R16B01. It's quite hard to hit,
as it requires a race between socket port usage and port termination
while getting preempted by the OS at a very precise code location.

A more complete fix will be included in 17.4, most probably looking like 
this:

diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index 8a622e5..64bd598 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -493,8 +493,8 @@ void
  erts_cleanup_port_data(Port *prt)
  {
      ASSERT(erts_atomic32_read_nob(&prt->state) & ERTS_PORT_SFLGS_INVALID_LOOKUP);
- cleanup_old_port_data(erts_smp_atomic_read_nob(&prt->data));
-    erts_smp_atomic_set_nob(&prt->data, (erts_aint_t) THE_NON_VALUE);
+ cleanup_old_port_data(erts_smp_atomic_xchg_nob(&prt->data,
+                                                  (erts_aint_t) NULL));
  }

  Uint
@@ -562,8 +562,14 @@ BIF_RETTYPE port_set_data_2(BIF_ALIST_2)

      data = erts_smp_atomic_xchg_wb(&prt->data, data);

+    if (data == (erts_aint_t)NULL) {
+       /* Port terminated by racing thread */
+       data = erts_smp_atomic_xchg_wb(&prt->data, data);
+       ASSERT(data != (erts_aint_t)NULL);
+       cleanup_old_port_data(data);
+       BIF_ERROR(BIF_P, BADARG);
+    }
      cleanup_old_port_data(data);
-
      BIF_RET(am_true);
  }

@@ -582,6 +588,8 @@ BIF_RETTYPE port_get_data_1(BIF_ALIST_1)
          BIF_ERROR(BIF_P, BADARG);

      data = erts_smp_atomic_read_ddrb(&prt->data);
+    if (data == (erts_aint_t)NULL)
+        BIF_ERROR(BIF_P, BADARG);  /* Port terminated by racing thread */

      if ((data & 0x3) != 0) {
         res = (Eterm) (UWord) data;


/Sverker, Erlang/OTP


On 09/29/2014 12:22 PM, 刘小飞 wrote:
> I use http://www.erlang.org/download/otp_src_17.0.tar.gz to build the erlang.
>
> BIF_RETTYPE port_get_data_1(BIF_ALIST_1)
> {
>      /*
>       * This is not a signal. See comment above.
>       */
>      Eterm res;
>      erts_aint_t data;
>      Port* prt;
>
>      prt = data_lookup_port(BIF_P, BIF_ARG_1);
>      if (!prt)
>          BIF_ERROR(BIF_P, BADARG);
>
>      data = erts_smp_atomic_read_ddrb(&prt->data);
>      if (!data)
>          BIF_ERROR(BIF_P, BADARG);      //I add the two lines to correct it.
>
>      if ((data & 0x3) != 0) {
>      res = (Eterm) (UWord) data;
>      ASSERT(is_immed(res));
>      }
>      else {
>      ErtsPortDataHeap *pdhp = (ErtsPortDataHeap *) data;
>      Eterm *hp = HAlloc(BIF_P, pdhp->hsize);
>      res = copy_struct(pdhp->data, pdhp->hsize, &hp, &MSO(BIF_P));
>      }
>
>      BIF_RET(res);
> }
>
>
> (gdb) bt full
> #0  0x0000000000514524 in port_get_data_1 (A__p=0x7f4bc0d66488, BIF__ARGS=<value optimized out>) at beam/erl_bif_port.c:591
>          pdhp = 0x0
>          hp = <value optimized out>
>          data = 0
> #1  0x000000000054d517 in process_main () at beam/beam_emu.c:2787
>          bf = 0x514490 <port_get_data_1>
>          result = 1688368833101607
>          init_done = 1
>          c_p = 0x7f4bc0d66488
>          reds_used = 178536832
>          x0 = 1688368833101607
>          reg = 0x7f4c0aa44180
>          HTOP = 0x7f4bc036d350
>          E = 0x7f4bc0370b18
>          I = 0x7f4bfb5c7af8
>          FCALLS = 1984
>          tmp_arg1 = 139963324058344
>          tmp_arg2 = 15
>          tmp_big = {139964436718400, 5662828}
>          freg = 0x7f4c0aa461c0
>          neg_o_reds = 0
>          arith_func = 0
>          opcodes = {0x54c14a, 0x54b78e, 0x54c06a, 0x54c0eb, 0x54c2f8, 0x54cee5, 0x54cb67, 0x54e173, 0x54ec5c, 0x54ca4d, 0x54ca43, 0x54ca23, 0x54908b, 0x54c5fe, 0x54d5c8, 0x54d605, 0x54d5f6, 0x54957d, 0x549451, 0x54d366, 0x54d26d,
>            0x54d29b, 0x54d063, 0x54d092, 0x54d245, 0x54d223, 0x54d176, 0x54d4bb, 0x54ca52, 0x54caa1, 0x54ba31, 0x54ba07, 0x54ba26, 0x54bf45, 0x54bf66, 0x54ccfb, 0x54c949, 0x546667, 0x54ca9c, 0x54674e, 0x546771, 0x54667a, 0x5466ab,
>            0x5466dc, 0x546715, 0x5464cd, 0x54eaa8, 0x54e14e, 0x54ea52, 0x54eb16, 0x546795, 0x5467b6, 0x5467d8, 0x5467f5, 0x546823, 0x546852, 0x546870, 0x54689f, 0x5468cf, 0x5468fc, 0x54692a, 0x546957, 0x54699e, 0x5469e6, 0x546a14,
>            0x546a5c, 0x546aa5, 0x546ad3, 0x546b02, 0x546b30, 0x546b78, 0x546bc1, 0x546bf0, 0x546c39, 0x54d3bf, 0x54b2ce, 0x54b39e, 0x54b3c9, 0x54e468, 0x54b3bf, 0x54b5f8, 0x54e1d9, 0x54b046, 0x54b0ac, 0x54b65f, 0x54ce89, 0x54b47a,
>            0x54d44b, 0x54e23b, 0x54b4bd, 0x5493b7, 0x54947d, 0x54ddf7, 0x54df22, 0x54e011, 0x54d8dd, 0x54d60f, 0x54d694, 0x54d965, 0x54d9db, 0x54da59, 0x54db67, 0x54d718, 0x54d348, 0x54d2c2, 0x54d340, 0x54b103, 0x54d34d, 0x54d1a0,
>            0x54d8d6, 0x54d857, 0x54d8b7, 0x54d6ad, 0x54d57d, 0x54d5ba, 0x54d810, 0x54d849, 0x549430, 0x54bbbe, 0x54e2f0, 0x54d38c, 0x54e331, 0x54e345, 0x54bab9, 0x54e357, 0x549185, 0x54e3c9, 0x54e3e9, 0x5494f1, 0x549306, 0x549515,
>            0x54d6a5, 0x54bbc9, 0x54953a, 0x54d0ee, 0x54d0b2, 0x54df04, 0x54ddd1, 0x54bc6c, 0x54dede, 0x54dd60, 0x54dc7d, 0x54dcef, 0x54af53, 0x54afcc, 0x54e7ca, 0x54e7ff, 0x54e294, 0x54e2be, 0x54b6ca, 0x54d355, 0x54d211, 0x54cf47,
>            0x54cf9a, 0x54d007, 0x54d12e, 0x54ec20, 0x54b559, 0x54b507, 0x546539, 0x54656a, 0x5465a3, 0x5465e0, 0x54e0ea, 0x54e400, 0x546504, 0x5464e2, 0x54ea08, 0x54be99, 0x54c53b, 0x54c5d7, 0x54e774, 0x54bee0, 0x54bf28, 0x54bf36,
>            0x5464cd, 0x54b222, 0x546d1f, 0x546d3e, 0x546d81, 0x546da0, 0x546dd2, 0x546e29, 0x546e49, 0x546e7c, 0x546d04, 0x546d5e, 0x546e05, 0x546ca2, 0x546cbd, 0x546ce0, 0x546c83, 0x54e83c, 0x54b1cc, 0x54b278, 0x54eb5b, 0x54ba46,
>            0x54c683, 0x54c726, 0x54c7b4...}
>          temp_bits = 139964436760704
>          pt_arity = 139963334550664
>          start_time = 0
>          start_time_i = 0x0
>          EBS = 0x7f4c0288c898
> #2  0x00000000004a081b in sched_thread_func (vesdp=0x7f4c0288c880) at beam/erl_process.c:7665
>          callbacks = {arg = 0x7f4c02882380, wakeup = 0x4a21b0 <thr_prgr_wakeup>, prepare_wait = 0x49e370 <thr_prgr_prep_wait>, wait = 0x49f6f0 <thr_prgr_wait>, finalize_wait = 0x49e350 <thr_prgr_fin_wait>}
>          esdp = 0x7f4c0288c880
>          no = 2
> #3  0x00000000005df676 in thr_wrapper (vtwd=<value optimized out>) at pthread/ethread.c:110
>          result = <value optimized out>
>          res = 0x7fffd82d4b90
>          twd = <value optimized out>
>          thr_func = 0x4a0700 <sched_thread_func>
>          arg = 0x7f4c0288c880
>          tsep = 0x7f4c0a2800a0
> #4  0x00000037d10079d1 in start_thread () from /lib64/libpthread.so.0
> No symbol table info available.
> #5  0x00000037d0ce8b6d in ?? ()
> No symbol table info available.
> #6  0x0000000000000000 in ?? ()
> No symbol table info available.
>
>
>
> _______________________________________________
> erlang-bugs mailing list
> erlang-bugs@REDACTED
> http://erlang.org/mailman/listinfo/erlang-bugs

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://erlang.org/pipermail/erlang-bugs/attachments/20141001/67201171/attachment.htm>


More information about the erlang-bugs mailing list