R13B01 bug in file:pwrite()?

Scott Lystig Fritchie fritchie@REDACTED
Fri Jul 31 06:18:30 CEST 2009


Hi, all.  This sounds really weird, but I have a test case that can
irregularly trigger this bug within 5-10 minutes.  I can't post all the
code, in part because it's driven by Quviq's QuickCheck.  :-)  I'll try
to cobble together as much info as I can gather, for such an
intermittent problem.

100% of the time, I see crashes after the "Use Scribble..." message.
The error could be happening perhaps after the func has returned, but
100% of the time there isn't any output following "Use Scribble...", and
the surrounding code is fairly verbose.

scribbles(ArgsList, PathT) ->
    io:format("Num scribbles = ~p\n", [length(ArgsList)]),
    lists:map(
      fun([SeqNum, Offset, NewBlob]) ->
	      Path = element((SeqNum rem size(PathT)) + 1, PathT),
	      {ok, FI} = file:read_file_info(Path),
	      Pos = Offset rem (FI#file_info.size + 1), % in case size = 0.
	      {ok, FH} = file:open(Path, [binary, read, write]),
	      io:format("Pread ~p at Pos ~p for ~p bytes\n", [Path, Pos, size(NewBlob)]),
	      OldBlob = case (catch file:pread(FH, Pos, size(NewBlob))) of
			    {ok, BLB} -> BLB;
			    _         -> cant_match_any_binary_ha_ha
			end,
	      io:format("Use Scribble ~p at ~p Pos ~p\n", [NewBlob, Path, Pos]),
	      ok = file:pwrite(FH, Pos, NewBlob),
	      file:close(FH),
	      OldBlob == NewBlob
      end, ArgsList).

I'm using R13B01 on a 32-bit install of a Linux Fedora 11 distro,
2.6.27.25-170.2.72.fc10.i686 kernel.  I've seen this happen with SMP
support enabled or disabled, with the +A worker pool enabled or
disabled, with +K true or false.

  USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
  fritchie 12106 77.8 45.8 1716800 1661460 pts/22 Sl+ 22:22   6:10 /tmp/R13B01-debug/lib/erlang/erts-5.7.2/bin/beam.smp -K true -- -root /tmp/R13B01-debug/lib/erlang -progname erl -- -home /home/fritchie -smp enable -sname foo_dev -boot foo -pz ... [many more -pz flags...]

(I had "ps u" running in a loop 4x per second.  The above was taken
during the ERTS_HOLE_MARKER output below.  The reading 0.25 seconds
earlier was VSZ of about 70MB.)

The above is a debug build, as best as I've been able to do.  (It isn't
one-command easy, alas.)

  Erlang R13B01 (erts-5.7.2) [source] [smp:2:2] [rq:2] [async-threads:0] [hipe] [kernel-poll:true] [type-assertions] [debug-compiled] [lock-checking]

Sample output after the crash looks like this.  Some messages are
lacking the "Num scribbles..." output because I'd added that one a bit
later.

  Use Scribble <<"ff">> at "./zzz-hlog-qc/2/6/-000000000061.HLOG" Pos 10172
  Pread "./zzz-hlog-qc/2/2/-000000000085.HLOG" at Pos 64 for 27 bytes
  Use Scribble <<"RRRRRRRRRRRRRRRRRRRRRRRRRRR">> at "./zzz-hlog-qc/2/2/-000000000085.HLOG" Pos 64
  Pread "./zzz-hlog-qc/2/6/-000000000061.HLOG" at Pos 1162 for 3 bytes
  Use Scribble <<"¸¸¸">> at "./zzz-hlog-qc/2/6/-000000000061.HLOG" Pos 1162
  Pread "./zzz-hlog-qc/000000000084.HLOG" at Pos 652 for 22 bytes
  Use Scribble <<"NNNNNNNNNNNNNNNNNNNNNN">> at "./zzz-hlog-qc/000000000084.HLOG" Pos 652
  Pread "./zzz-hlog-qc/000000000046.HLOG" at Pos 653 for 8 bytes
  Use Scribble <<"ìììììììì">> at "./zzz-hlog-qc/000000000046.HLOG" Pos 653
  make: *** [run-app1-interactive] Segmentation fault

  .....Pread "./zzz-hlog-qc/000000000001.HLOG" at Pos 216 for 1 bytes
  Use Scribble <<"Z">> at "./zzz-hlog-qc/000000000001.HLOG" Pos 216
  Pread "./zzz-hlog-qc/000000000001.HLOG" at Pos 330 for 1 bytes
  Use Scribble <<"Ö">> at "./zzz-hlog-qc/000000000001.HLOG" Pos 330
  Pread "./zzz-hlog-qc/000000000001.HLOG" at Pos 83 for 1 bytes
  Use Scribble <<"°">> at "./zzz-hlog-qc/000000000001.HLOG" Pos 83
  Pread "./zzz-hlog-qc/000000000001.HLOG" at Pos 176 for 1 bytes
  Use Scribble <<"?">> at "./zzz-hlog-qc/000000000001.HLOG" Pos 176

  Crash dump was written to: erl_crash.dump
  eheap_alloc: Cannot allocate 2147483700 bytes of memory (of type "heap_frag").
  make: *** [run-app1-interactive] Aborted

----

  Pread "./zzz-hlog-qc/000000000001.HLOG" at Pos 443 for 0 bytes
  Use Scribble <<>> at "./zzz-hlog-qc/000000000001.HLOG" Pos 443
  Pread "./zzz-hlog-qc/000000000065.HLOG" at Pos 2496 for 21 bytes
  Use Scribble <<"ooooooooooooooooooooo">> at "./zzz-hlog-qc/000000000065.HLOG" Pos 2496
  Pread "./zzz-hlog-qc/2/2/-000000000064.HLOG" at Pos 70 for 16 bytes
  Use Scribble <<"´´´´´´´´´´´´´´´´">> at "./zzz-hlog-qc/2/2/-000000000064.HLOG" Pos 70
  Pread "./zzz-hlog-qc/3/6/-000000000047.HLOG" at Pos 6226 for 4 bytes
  Use Scribble <<"ºººº">> at "./zzz-hlog-qc/3/6/-000000000047.HLOG" Pos 6226
  make: *** [run-app1-interactive] Segmentation fault

----

  .
  =GMT ERR REPORT==== 30-Jul-2009::21:05:45 ===
  fold_a_file: seq -2 offset 0: {error,unknown_file_header,
                                       {ok,<<"LogVersion1L">>}}
  .Pread "./zzz-hlog-qc/3/3/-000000000002.HLOG" at Pos 115 for 1 bytes
  Use Scribble <<"·">> at "./zzz-hlog-qc/3/3/-000000000002.HLOG" Pos 115
  Pread "./zzz-hlog-qc/3/3/-000000000002.HLOG" at Pos 127 for 1 bytes
  Use Scribble <<"0">> at "./zzz-hlog-qc/3/3/-000000000002.HLOG" Pos 127

  Crash dump was written to: erl_crash.dump
  eheap_alloc: Cannot allocate 3087007804 bytes of memory (of type "heap_frag").
  make: *** [run-app1-interactive] Aborted

----

  ..nnNum scribbles = 2
  Pread "./zzz-hlog-qc/000000000023.HLOG" at Pos 183 for 6 bytes
  Use Scribble <<21,21,21,21,21,21>> at "./zzz-hlog-qc/000000000023.HLOG"
  Pos 183
  Pread "./zzz-hlog-qc/1/4/-000000000003.HLOG" at Pos 966 for 13 bytes
  Use Scribble <<"xxxxxxxxxxxxx">> at
  "./zzz-hlog-qc/1/4/-000000000003.HLOG" Pos 9
  66
  ErrList1  ErrList2 [{seq,23,err,badarg}] [{seq,-3,err,{badmatch,false}}]
  .Num scribbles = 1
  Pread "./zzz-hlog-qc/3/3/-000000000002.HLOG" at Pos 32 for 18 bytes
  Use Scribble
  <<140,140,140,140,140,140,140,140,140,140,140,140,140,140,140,140,
                 140,140>> at "./zzz-hlog-qc/3/3/-000000000002.HLOG" Pos
  	       32
  beam/erl_debug.c, line 361: ERTS_HOLE_MARKER found at 0x531ca148
  |            | Range: 0x4e391038 - 0xb29d74a4              |
  | Address    | Contents                                    |
  |------------|---------------------------------------------|
  | 0x4e391038 | 0x000000c0 0x4e391049 0x4e391044 0x00000000 |
  | 0x4e391048 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x4e391058 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x4e391068 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x4e391078 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x4e391088 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x4e391098 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x4e3910a8 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  [... 7 gazillions of nearly identical lines omitted...]
  | 0x531ca0f8 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca108 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca118 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca128 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca138 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca148 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca158 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca168 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca178 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca188 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  | 0x531ca198 | 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc 0xaf5e78cc |
  [... I didn't have enough disk space to record output to 0xb29d74a4...]

Attaching GDB to the process while QC is running doesn't work very
well.  After "attach" and "continue", the VM freezes.  Pressing C-c and
C-g (running GDB within Emacs in gdb-mode) can't interrupt GDB to get a
prompt back.  I can only kill the GDB process externally.  :-(  So
catching a SIGSEGV in-the-act hasn't been possible.  Is there some trick
to doing that?  Or perhaps FC11's GDB is braindamaged somehow?  Or
Linux?

  % gdb --version
  GNU gdb Fedora (6.8-32.fc10)
  [...]
  This GDB was configured as "i386-redhat-linux-gnu".

However, while all the ERTS_HOLE_MARKER stuff was dumping, I attached
GDB and got this info on all threads.

  (gdb) thread apply all info stack
  
  Thread 6 (Thread 0xb7d27b90 (LWP 12113)):
  #0  0xb8028416 in __kernel_vsyscall ()
  #1  0x0054f19b in read () from /lib/libpthread.so.0
  #2  0x081cb5ec in signal_dispatcher_thread_func (unused=0x0)
      at sys/unix/sys.c:2913
  #3  0x0823c0b6 in thr_wrapper (vtwd=0xbfaae2bc) at common/ethread.c:475
  #4  0x0054851f in start_thread () from /lib/libpthread.so.0
  #5  0x0047e04e in clone () from /lib/libc.so.6
  
  Thread 5 (Thread 0xb7326b90 (LWP 12114)):
  #0  0xb8028416 in __kernel_vsyscall ()
  #1  0x0054c105 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib/libpthread.so.0
  #2  0x0823cf29 in ethr_cond_wait (cnd=0x82affe0, mtx=0x82aff40)
      at common/ethread.c:1018
  #3  0x080c170e in erts_cnd_wait (cnd=0x82affe0, mtx=0x82aff40)
      at beam/erl_threads.h:634
  #4  0x080c26f8 in erts_smp_cnd_wait (cnd=0x82affe0, mtx=0x82aff40)
      at beam/erl_smp.h:417
  #5  0x080bf963 in sys_msg_dispatcher_func (unused=0x0)
      at beam/erl_trace.c:3078
  #6  0x0823c0b6 in thr_wrapper (vtwd=0xbfaae32c) at common/ethread.c:475
  #7  0x0054851f in start_thread () from /lib/libpthread.so.0
  #8  0x0047e04e in clone () from /lib/libc.so.6
  
  Thread 4 (Thread 0xb8026b90 (LWP 12115)):
  #0  0xb8028416 in __kernel_vsyscall ()
  #1  0x0054c105 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib/libpthread.so.0
  #2  0x0823cf29 in ethr_cond_wait (cnd=0x82d8c00, mtx=0x82d8bc0)
      at common/ethread.c:1018
  #3  0x080c170e in erts_cnd_wait (cnd=0x82d8c00, mtx=0x82d8bc0)
      at beam/erl_threads.h:634
  #4  0x081cb4a9 in child_waiter (unused=0x0) at sys/unix/sys.c:2842
  #5  0x0823c0b6 in thr_wrapper (vtwd=0xbfaae2cc) at common/ethread.c:475
  #6  0x0054851f in start_thread () from /lib/libpthread.so.0
  #7  0x0047e04e in clone () from /lib/libc.so.6
  
  Thread 3 (Thread 0xb6925b90 (LWP 12116)):
  #0  0xb8028416 in __kernel_vsyscall ()
  #1  0x0046db3b in write () from /lib/libc.so.6
  #2  0x00404a7c in _IO_new_file_write () from /lib/libc.so.6
  #3  0x00405bf7 in _IO_new_do_write () from /lib/libc.so.6
  #4  0x004055a5 in _IO_new_file_overflow () from /lib/libc.so.6
  #5  0x00408423 in __overflow () from /lib/libc.so.6
  #6  0x082410f7 in write_f_add_cr (vfp=0x50a4c0, buf=0x8250ffe "|\n", len=2)
      at common/erl_printf.c:108
  #7  0x08240c97 in erts_printf_format (fn=0x8241070 <write_f_add_cr>, 
      arg=0x50a4c0, fmt=0x8250ffe "|\n", ap=0xb6924174 "\b")
      at common/erl_printf_format.c:832
  #8  0x08241576 in erts_printf (format=0x8250ffe "|\n")
      at common/erl_printf.c:218
  #9  0x080ebf5e in print_untagged_memory (pos=0x641e0338, end=0xb29d74a8)
      at beam/erl_debug.c:518
  #10 0x080eba1c in check_memory (start=0x4e391038, end=0xb29d74a8)
      at beam/erl_debug.c:362
  #11 0x080eb987 in erts_check_for_holes (p=0xb4e32d94) at beam/erl_debug.c:347
  #12 0x08199ed4 in process_main () at beam/beam_emu.c:2049
  #13 0x080f6f67 in sched_thread_func (vesdp=0xb7e0bc00)
      at beam/erl_process.c:3015
  #14 0x0823c0b6 in thr_wrapper (vtwd=0xbfaae38c) at common/ethread.c:475
  #15 0x0054851f in start_thread () from /lib/libpthread.so.0
  #16 0x0047e04e in clone () from /lib/libc.so.6
  
  Thread 2 (Thread 0xb5f24b90 (LWP 12117)):
  #0  0xb8028416 in __kernel_vsyscall ()
  #1  0x0047e836 in epoll_wait () from /lib/libc.so.6
  #2  0x081d07e5 in check_fd_events (ps=0xb7daa400, tv=0xb5f2291c, max_res=256, 
      ps_locked=0xb5f228d4) at sys/common/erl_poll.c:1906
  #3  0x081d093b in erts_poll_wait_kp (ps=0xb7daa400, pr=0xb5f22928, 
      len=0xb5f22924, utvp=0xb5f2291c) at sys/common/erl_poll.c:2042
  #4  0x081d2f18 in erts_check_io_kp (do_wait=1)
      at sys/common/erl_check_io.c:1156
  #5  0x081cb4e1 in erl_sys_schedule (runnable=0) at sys/unix/sys.c:2860
  #6  0x080f0f3f in sched_sys_wait (no=2, rq=0xb7e0b758)
      at beam/erl_process.c:774
  #7  0x080fdcb9 in schedule (p=0xb4f4bd1c, calls=15) at beam/erl_process.c:6023
  #8  0x081887eb in process_main () at beam/beam_emu.c:1126
  #9  0x080f6f67 in sched_thread_func (vesdp=0xb7e1dcd0)
      at beam/erl_process.c:3015
  #10 0x0823c0b6 in thr_wrapper (vtwd=0xbfaae38c) at common/ethread.c:475
  #11 0x0054851f in start_thread () from /lib/libpthread.so.0
  #12 0x0047e04e in clone () from /lib/libc.so.6
  
  Thread 1 (Thread 0xb800e6c0 (LWP 12106)):
  #0  0xb8028416 in __kernel_vsyscall ()
  #1  0x004763d1 in select () from /lib/libc.so.6
  #2  0x081cb7d1 in erts_sys_main_thread () at sys/unix/sys.c:3021
  #3  0x080918b1 in erl_start (argc=62, argv=0xbfaae684) at beam/erl_init.c:1231
  #4  0x08072e66 in main (argc=Cannot access memory at address 0x0
  ) at sys/unix/erl_main.c:29
  (gdb) 

The ERTS_HOLE_MARKER thing happens much less frequently than the simple
SIGSEGV or out-of-memory thing, perhaps once every 20 minutes or so of
constant trying.  I've got a copy of the core image via "gcore", if it'd
help.

I'm at a bit of a loss of what other info I can provide, so please let
me know if I can send something else. 

-Scott


More information about the erlang-bugs mailing list