[erlang-questions] boring bug report
Claes Wikstrom
klacke@REDACTED
Mon Nov 6 10:46:30 CET 2006
I'm running yaws at www.hyber.org and the beam process was fucked up.
I couldn't get a shell into the node, thus it is non trivial to figure
out what happened.
Anyway, I attach as much info as I can. I realize that this is
not sufficent for finding the bug, thus this report is more of a
existence proof showing that there is a bug in the central poll loop -
thus serious.
[root@REDACTED]src > uname -a
Linux gnejs 2.6.13-gentoo-r3 #1 Thu Oct 27 22:29:37 CEST 2005 i686 Intel(R) Pentium(R) 4 CPU 3.00GHz GenuineIntel GNU/Linux
First the strace, which shows that poll repeatedly reports 2 pipes
as ready to read, a read attempt is done which returns 0, but the
fds are still part of the readset. Note furtherdown that it looks as if the pipe
buffers are full.
poll([{fd=3, events=POLLIN|POLLRDNORM}, {fd=10, events=POLLIN|POLLRDNORM}, {fd=13, events=POLLIN|POLLRDNORM}, {fd=445, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}, {fd=341, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}], 5, 0) = 2
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2025}, NULL) = 0
read(445, "", 65536) = 0
read(341, "", 65536) = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2147}, NULL) = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2252}, NULL) = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2358}, NULL) = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2463}, NULL) = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2568}, NULL) = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2673}, NULL) = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2787}, NULL) = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2892}, NULL) = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2997}, NULL) = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 3102}, NULL) = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 3208}, NULL) = 0
poll([{fd=3, events=POLLIN|POLLRDNORM}, {fd=10, events=POLLIN|POLLRDNORM}, {fd=13, events=POLLIN|POLLRDNORM}, {fd=445, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}, {fd=341, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}], 5, 0) = 2
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 3338}, NULL) = 0
read(445, "", 65536) = 0
read(341, "", 65536) = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661
[root@REDACTED]yaws > lsof -p 13901 | egrep '445|341'
beam 13901 root 44u IPv4 3072001 TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-134.googlebot.com:34149 (CLOSE_WAIT)
beam 13901 root 198u IPv4 3393419 TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-42.googlebot.com:65535 (CLOSE_WAIT)
beam 13901 root 266u IPv4 3576948 TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-42.googlebot.com:33417 (CLOSE_WAIT)
beam 13901 root 297u IPv4 3594452 TCP h200n2fls31o899.telia.com:http->livebot-65-54-188-138.search.live.com:26155 (CLOSE_WAIT)
beam 13901 root 341r FIFO 0,5 3651470 pipe
beam 13901 root 445r FIFO 0,5 3829601 pipe
beam 13901 root 646u IPv4 4113443 TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-138.googlebot.com:34155 (CLOSE_WAIT)
beam 13901 root 667u IPv4 4139850 TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-138.googlebot.com:41341 (CLOSE_WAIT)
[root@REDACTED]yaws >
=ERROR REPORT==== 5-Nov-2006::06:37:27 ===
Yaws process died: {{badmatch,{error,emfile}},
[{yaws_server,cache_file,4},
{yaws_server,handle_request,3},
{yaws_server,aloop,3},
{yaws_server,acceptor0,2},
{proc_lib,init_p,5}]}
=ERROR REPORT==== 5-Nov-2006::06:53:48 ===
Error in process <0.25662.18> with exit value: {enfile,[{erlang,open_port,[{spawn,"zip -q -1 -r - ."},[{cd,"/var/yaws/www"},use_stdio,binary,exit_status]]},{yaws_ls,zip,2}
]}
[root@REDACTED]yaws > ps aguxww | grep zip
root 29041 0.0 0.0 1740 552 pts/1 R+ 10:25 0:00 grep zip
[root@REDACTED]yaws > top
top - 10:26:07 up 84 days, 11:20, 3 users, load average: 0.99, 1.00, 1.00
Tasks: 97 total, 4 running, 93 sleeping, 0 stopped, 0 zombie
Cpu0 : 65.8% us, 7.6% sy, 0.0% ni, 0.0% id, 0.0% wa, 0.0% hi, 26.6% si
Mem: 1034188k total, 791604k used, 242584k free, 253548k buffers
Swap: 1004052k total, 148k used, 1003904k free, 184772k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
13901 root 25 0 45636 38m 1632 R 71.1 3.8 18337:12 beam
6599 root 16 0 2740 1440 1036 R 25.4 0.1 4840:28 gkrellmd
29048 root 15 0 2072 1092 832 R 1.3 0.1 0:00.02 top
1 root 16 0 1428 484 420 S 0.0 0.0 0:02.83 init
2 root 34 19 0 0 0 S 0.0 0.0 0:00.12 ksoftirqd/0
...........
[root@REDACTED]yaws > netstat -anp | grep beam
tcp 0 0 127.0.0.1:40101 0.0.0.0:* LISTEN 13901/beam
tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN 13901/beam
tcp 1 0 217.209.72.200:80 66.249.72.208:39990 CLOSE_WAIT 13901/beam
tcp 1 0 217.209.72.200:80 65.55.213.33:19051 CLOSE_WAIT 13901/beam
tcp 1 0 217.209.72.200:80 66.249.72.104:40816 CLOSE_WAIT 13901/beam
tcp 1 0 217.209.72.200:80 66.249.72.104:61052 CLOSE_WAIT 13901/beam
tcp 1 0 217.209.72.200:80 66.249.72.104:54650 CLOSE_WAIT 13901/beam
......
Lot's and lots of CLOSE_WAIT sockets
[root@REDACTED]yaws > netstat -anp | grep beam | wc -l
652
Accesslog showing last time a request was shipped.
[root@REDACTED]yaws > tail yaws.hyber.org.access
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:19 +0100] "GET /icons/pdf.gif HTTP/1.0" 404 261 "http://yaws.hyber.org/" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:22 +0100] "GET /favicon.ico HTTP/1.0" 404 259 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:49 +0100] "GET /favicon.ico HTTP/1.0" 404 259 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:58 +0100] "GET /index.yaws HTTP/1.0" 404 258 "http://yaws.hyber.org/" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
crawl-66-249-65-138.googlebot.com - - [05/Nov/2006:06:58:34 +0100] "GET /redirect2.yaws HTTP/1.1" 404 262 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
lj612545.inktomisearch.com - - [05/Nov/2006:06:59:25 +0100] "GET /robots.txt HTTP/1.0" 404 258 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
lj611447.inktomisearch.com - - [05/Nov/2006:06:59:25 +0100] "GET /contact.yaws HTTP/1.0" 404 260 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
dsl-189-142-83-233.prod-infinitum.com.mx - - [05/Nov/2006:07:01:35 +0100] "GET / HTTP/1.1" 200 18088 "http://www.google.com.mx/search?hl=es&q=yaws&btnG=B%C3%BAsqueda+en+Google&meta=" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
dsl-189-142-83-233.prod-infinitum.com.mx - - [05/Nov/2006:07:01:41 +0100] "GET / HTTP/1.1" 200 18088 "http://www.google.com.mx/search?hl=es&q=yaws&btnG=B%C3%BAsqueda+en+Google&meta=" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
dsl-189-142-83-233.prod-infinitum.com.mx - - [05/Nov/2006:07:01:41 +0100] "GET /icons/back.gif HTTP/1.1" 404 262 "http://yaws.hyber.org/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
The erlang version,
[root@REDACTED]src > /usr/local/bin/erl
Erlang (BEAM) emulator version 5.4.10 [source] [hipe]
Eshell V5.4.10 (abort with ^G)
1>
--
Claes Wikstrom -- Caps lock is nowhere and
http://www.hyber.org -- everything is under control
cellphone: +46 70 2097763
More information about the erlang-questions
mailing list