[erlang-questions] boring bug report

Claes Wikstrom klacke@REDACTED
Mon Nov 6 10:46:30 CET 2006


I'm running yaws at www.hyber.org and the beam process was fucked up.
I couldn't get a shell into the node, thus it is non trivial to figure
out what happened.

Anyway, I attach as much info as I can. I realize that this is
not sufficent for finding the bug, thus this report is more of a
existence proof showing that there is a bug in the central poll loop -
thus serious.




[root@REDACTED]src > uname -a
Linux gnejs 2.6.13-gentoo-r3 #1 Thu Oct 27 22:29:37 CEST 2005 i686 Intel(R) Pentium(R) 4 CPU 3.00GHz GenuineIntel GNU/Linux




First the strace, which shows that poll repeatedly reports 2 pipes
as ready to read, a read attempt is done which returns 0, but the
fds are still part of the readset. Note furtherdown that it looks as if the pipe
buffers are full.



poll([{fd=3, events=POLLIN|POLLRDNORM}, {fd=10, events=POLLIN|POLLRDNORM}, {fd=13, events=POLLIN|POLLRDNORM}, {fd=445, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}, {fd=341, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}], 5, 0) = 2
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2025}, NULL)  = 0
read(445, "", 65536)                    = 0
read(341, "", 65536)                    = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2147}, NULL)  = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2252}, NULL)  = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2358}, NULL)  = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2463}, NULL)  = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2568}, NULL)  = 0
times({tms_utime=48330945, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2673}, NULL)  = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2787}, NULL)  = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2892}, NULL)  = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 2997}, NULL)  = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 3102}, NULL)  = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 3208}, NULL)  = 0
poll([{fd=3, events=POLLIN|POLLRDNORM}, {fd=10, events=POLLIN|POLLRDNORM}, {fd=13, events=POLLIN|POLLRDNORM}, {fd=445, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}, {fd=341, events=POLLIN|POLLOUT|POLLRDNORM, revents=POLLHUP}], 5, 0) = 2
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661, 3338}, NULL)  = 0
read(445, "", 65536)                    = 0
read(341, "", 65536)                    = 0
times({tms_utime=48330946, tms_stime=61652394, tms_cutime=15118, tms_cstime=3101}) = -1847126572
gettimeofday({1162804661






[root@REDACTED]yaws > lsof  -p 13901 | egrep '445|341'
beam    13901 root   44u  IPv4 3072001              TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-134.googlebot.com:34149 (CLOSE_WAIT)
beam    13901 root  198u  IPv4 3393419              TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-42.googlebot.com:65535 (CLOSE_WAIT)
beam    13901 root  266u  IPv4 3576948              TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-42.googlebot.com:33417 (CLOSE_WAIT)
beam    13901 root  297u  IPv4 3594452              TCP h200n2fls31o899.telia.com:http->livebot-65-54-188-138.search.live.com:26155 (CLOSE_WAIT)
beam    13901 root  341r  FIFO     0,5          3651470 pipe
beam    13901 root  445r  FIFO     0,5          3829601 pipe
beam    13901 root  646u  IPv4 4113443              TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-138.googlebot.com:34155 (CLOSE_WAIT)
beam    13901 root  667u  IPv4 4139850              TCP h200n2fls31o899.telia.com:http->crawl-66-249-65-138.googlebot.com:41341 (CLOSE_WAIT)
[root@REDACTED]yaws >








=ERROR REPORT==== 5-Nov-2006::06:37:27 ===
Yaws process died: {{badmatch,{error,emfile}},
                     [{yaws_server,cache_file,4},
                      {yaws_server,handle_request,3},
                      {yaws_server,aloop,3},
                      {yaws_server,acceptor0,2},
                      {proc_lib,init_p,5}]}

=ERROR REPORT==== 5-Nov-2006::06:53:48 ===
Error in process <0.25662.18> with exit value: {enfile,[{erlang,open_port,[{spawn,"zip -q -1 -r - ."},[{cd,"/var/yaws/www"},use_stdio,binary,exit_status]]},{yaws_ls,zip,2}
]}





[root@REDACTED]yaws > ps aguxww | grep zip
root     29041  0.0  0.0   1740   552 pts/1    R+   10:25   0:00 grep zip







[root@REDACTED]yaws > top
top - 10:26:07 up 84 days, 11:20,  3 users,  load average: 0.99, 1.00, 1.00
Tasks:  97 total,   4 running,  93 sleeping,   0 stopped,   0 zombie
Cpu0  : 65.8% us,  7.6% sy,  0.0% ni,  0.0% id,  0.0% wa,  0.0% hi, 26.6% si
Mem:   1034188k total,   791604k used,   242584k free,   253548k buffers
Swap:  1004052k total,      148k used,  1003904k free,   184772k cached

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
13901 root      25   0 45636  38m 1632 R 71.1  3.8  18337:12 beam
  6599 root      16   0  2740 1440 1036 R 25.4  0.1   4840:28 gkrellmd
29048 root      15   0  2072 1092  832 R  1.3  0.1   0:00.02 top
     1 root      16   0  1428  484  420 S  0.0  0.0   0:02.83 init
     2 root      34  19     0    0    0 S  0.0  0.0   0:00.12 ksoftirqd/0
...........





[root@REDACTED]yaws > netstat -anp | grep beam
tcp        0      0 127.0.0.1:40101         0.0.0.0:*               LISTEN      13901/beam
tcp        0      0 0.0.0.0:80              0.0.0.0:*               LISTEN      13901/beam
tcp        1      0 217.209.72.200:80       66.249.72.208:39990     CLOSE_WAIT  13901/beam
tcp        1      0 217.209.72.200:80       65.55.213.33:19051      CLOSE_WAIT  13901/beam
tcp        1      0 217.209.72.200:80       66.249.72.104:40816     CLOSE_WAIT  13901/beam
tcp        1      0 217.209.72.200:80       66.249.72.104:61052     CLOSE_WAIT  13901/beam
tcp        1      0 217.209.72.200:80       66.249.72.104:54650     CLOSE_WAIT  13901/beam
......



Lot's and lots of CLOSE_WAIT sockets



[root@REDACTED]yaws > netstat -anp | grep beam  | wc -l
652




Accesslog showing last time a request was shipped.




[root@REDACTED]yaws > tail yaws.hyber.org.access
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:19 +0100] "GET /icons/pdf.gif HTTP/1.0" 404 261 "http://yaws.hyber.org/" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:22 +0100] "GET /favicon.ico HTTP/1.0" 404 259 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:49 +0100] "GET /favicon.ico HTTP/1.0" 404 259 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
S010600207801ae82.vc.shawcable.net - - [05/Nov/2006:06:57:58 +0100] "GET /index.yaws HTTP/1.0" 404 258 "http://yaws.hyber.org/" "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
crawl-66-249-65-138.googlebot.com - - [05/Nov/2006:06:58:34 +0100] "GET /redirect2.yaws HTTP/1.1" 404 262 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
lj612545.inktomisearch.com - - [05/Nov/2006:06:59:25 +0100] "GET /robots.txt HTTP/1.0" 404 258 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
lj611447.inktomisearch.com - - [05/Nov/2006:06:59:25 +0100] "GET /contact.yaws HTTP/1.0" 404 260 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
dsl-189-142-83-233.prod-infinitum.com.mx - - [05/Nov/2006:07:01:35 +0100] "GET / HTTP/1.1" 200 18088 "http://www.google.com.mx/search?hl=es&q=yaws&btnG=B%C3%BAsqueda+en+Google&meta=" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
dsl-189-142-83-233.prod-infinitum.com.mx - - [05/Nov/2006:07:01:41 +0100] "GET / HTTP/1.1" 200 18088 "http://www.google.com.mx/search?hl=es&q=yaws&btnG=B%C3%BAsqueda+en+Google&meta=" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"
dsl-189-142-83-233.prod-infinitum.com.mx - - [05/Nov/2006:07:01:41 +0100] "GET /icons/back.gif HTTP/1.1" 404 262 "http://yaws.hyber.org/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0"






The erlang version,

[root@REDACTED]src > /usr/local/bin/erl
Erlang (BEAM) emulator version 5.4.10 [source] [hipe]

Eshell V5.4.10  (abort with ^G)
1>


-- 
Claes Wikstrom                        -- Caps lock is nowhere and
http://www.hyber.org                  -- everything is under control
cellphone: +46 70 2097763



More information about the erlang-questions mailing list