Kernel poll Linux 2.6 support

Mickael Remond mickael.remond@REDACTED
Fri Dec 16 10:59:20 CET 2005


* Mickael Remond <mickael.remond@REDACTED> [2005-12-16 10:56:44 +0100]:

> Hello,
> 
> Here is a patch we have been using with success on several production
> sites.
> It enables Linux 2.6 kernel poll support
> (enabled with ./configure --enable-kernel-poll) 
> 
> Then, you can enable Kernel polling from the command-line
> ./bin/erl +K true 
> (This is what has already been integrated for FreeBSD).
> 
> I have talked about it with some developers during EUC, and it seems that
> it could be a good addition to Erlang/OTP.
> Heavy networking is really impressive with this patch: CPU consumption
> is dramatically reduced.

Here is the patch.

-- 
Mickaël Rémond
 http://www.process-one.net/
-------------- next part --------------
diff -ru ORIG_otp_src_R10B-8/erts/acconfig.h otp_src_R10B-8/erts/acconfig.h
--- ORIG_otp_src_R10B-8/erts/acconfig.h	2005-06-21 09:37:40.000000000 -0700
+++ otp_src_R10B-8/erts/acconfig.h	2005-11-09 14:25:58.330158793 -0800
@@ -188,6 +188,9 @@
 /* Define if you have the <linux/kpoll.h> header file. */
 #undef HAVE_LINUX_KPOLL_H
  
+/* Define if you have the <sys/epoll.h> header file. */
+#undef HAVE_LINUX_EPOLL_H
+
 /* Define if you have the <sys/event.h> header file. */
 #undef HAVE_SYS_EVENT_H
 
@@ -224,7 +227,7 @@
 
 #if !defined(USE_SELECT)
 #  if defined(ENABLE_KERNEL_POLL)
-#    if defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_LINUX_KPOLL_H) || defined(HAVE_SYS_EVENT_H)
+#    if defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_LINUX_KPOLL_H) || defined(HAVE_SYS_EVENT_H) || defined(HAVE_LINUX_EPOLL_H)
 #      define USE_KERNEL_POLL
 #    endif
 #  endif
diff -ru ORIG_otp_src_R10B-8/erts/config.h.in otp_src_R10B-8/erts/config.h.in
--- ORIG_otp_src_R10B-8/erts/config.h.in	2005-10-25 02:50:42.000000000 -0700
+++ otp_src_R10B-8/erts/config.h.in	2005-11-09 14:25:58.330158793 -0800
@@ -175,6 +175,8 @@
 /* Define if you have the <linux/kpoll.h> header file. */
 #undef HAVE_LINUX_KPOLL_H
 
+/* Define if you have the <sys/epoll.h> header file. */
+#undef HAVE_LINUX_EPOLL_H
  
 /* Define if you have the <sys/event.h> header file. */
 #undef HAVE_SYS_EVENT_H
@@ -399,7 +401,7 @@
 
 #if !defined(USE_SELECT)
 #  if defined(ENABLE_KERNEL_POLL)
-#    if defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_LINUX_KPOLL_H) || defined(HAVE_SYS_EVENT_H)
+#    if defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_LINUX_KPOLL_H) || defined(HAVE_SYS_EVENT_H) || defined(HAVE_LINUX_EPOLL_H)
 #      define USE_KERNEL_POLL
 #    endif
 #  endif
Only in otp_src_R10B-8/erts: config.h.in.orig
diff -ru ORIG_otp_src_R10B-8/erts/configure otp_src_R10B-8/erts/configure
--- ORIG_otp_src_R10B-8/erts/configure	2005-10-25 02:50:36.000000000 -0700
+++ otp_src_R10B-8/erts/configure	2005-11-09 14:25:58.334158707 -0800
@@ -3716,6 +3716,151 @@
 fi
  
 
+
+if test "${ac_cv_header_sys_epoll_h+set}" = set; then
+  echo "$as_me:$LINENO: checking for sys/epoll.h" >&5
+echo $ECHO_N "checking for sys/epoll.h... $ECHO_C" >&6
+if test "${ac_cv_header_sys_epoll_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+fi
+echo "$as_me:$LINENO: result: $ac_cv_header_sys_epoll_h" >&5
+echo "${ECHO_T}$ac_cv_header_sys_epoll_h" >&6
+else
+  # Is the header compilable?
+echo "$as_me:$LINENO: checking sys/epoll.h usability" >&5
+echo $ECHO_N "checking sys/epoll.h usability... $ECHO_C" >&6
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+#include <sys/epoll.h>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
+  (eval $ac_compile) 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag"
+			 || test ! -s conftest.err'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest.$ac_objext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_header_compiler=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ac_header_compiler=no
+fi
+rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
+echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+echo "${ECHO_T}$ac_header_compiler" >&6
+
+# Is the header present?
+echo "$as_me:$LINENO: checking sys/epoll.h presence" >&5
+echo $ECHO_N "checking sys/epoll.h presence... $ECHO_C" >&6
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <sys/epoll.h>
+_ACEOF
+if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5
+  (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null; then
+  if test -s conftest.err; then
+    ac_cpp_err=$ac_c_preproc_warn_flag
+    ac_cpp_err=$ac_cpp_err$ac_c_werror_flag
+  else
+    ac_cpp_err=
+  fi
+else
+  ac_cpp_err=yes
+fi
+if test -z "$ac_cpp_err"; then
+  ac_header_preproc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
+fi
+rm -f conftest.err conftest.$ac_ext
+echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+echo "${ECHO_T}$ac_header_preproc" >&6
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+  yes:no: )
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h: accepted by the compiler, rejected by the preprocessor!" >&5
+echo "$as_me: WARNING: sys/epoll.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h: proceeding with the compiler's result" >&5
+echo "$as_me: WARNING: sys/epoll.h: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h: present but cannot be compiled" >&5
+echo "$as_me: WARNING: sys/epoll.h: present but cannot be compiled" >&2;}
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h:     check for missing prerequisite headers?" >&5
+echo "$as_me: WARNING: sys/epoll.h:     check for missing prerequisite headers?" >&2;}
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h: see the Autoconf documentation" >&5
+echo "$as_me: WARNING: sys/epoll.h: see the Autoconf documentation" >&2;}
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h:     section \"Present But Cannot Be Compiled\"" >&5
+echo "$as_me: WARNING: sys/epoll.h:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h: proceeding with the preprocessor's result" >&5
+echo "$as_me: WARNING: sys/epoll.h: proceeding with the preprocessor's result" >&2;}
+    { echo "$as_me:$LINENO: WARNING: sys/epoll.h: in the future, the compiler will take precedence" >&5
+echo "$as_me: WARNING: sys/epoll.h: in the future, the compiler will take precedence" >&2;}
+    (
+      cat <<\_ASBOX
+## ------------------------------------------ ##
+## Report this to the AC_PACKAGE_NAME lists.  ##
+## ------------------------------------------ ##
+_ASBOX
+    ) |
+      sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+echo "$as_me:$LINENO: checking for sys/epoll.h" >&5
+echo $ECHO_N "checking for sys/epoll.h... $ECHO_C" >&6
+if test "${ac_cv_header_sys_epoll_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_cv_header_sys_epoll_h=$ac_header_preproc
+fi
+echo "$as_me:$LINENO: result: $ac_cv_header_sys_epoll_h" >&5
+echo "${ECHO_T}$ac_cv_header_sys_epoll_h" >&6
+
+fi
+if test $ac_cv_header_sys_epoll_h = yes; then
+  cat >>confdefs.h <<\_ACEOF
+#define HAVE_LINUX_EPOLL_H 1
+_ACEOF
+
+fi
+
 echo $ac_n "checking for SO_BSDCOMPAT declaration""... $ac_c" 1>&6
 echo "configure:3721: checking for SO_BSDCOMPAT declaration" >&5
 if eval "test \"`echo '$''{'ac_cv_decl_so_bsdcompat'+set}'`\" = set"; then
diff -ru ORIG_otp_src_R10B-8/erts/configure.in otp_src_R10B-8/erts/configure.in
--- ORIG_otp_src_R10B-8/erts/configure.in	2005-10-24 02:43:05.000000000 -0700
+++ otp_src_R10B-8/erts/configure.in	2005-11-09 14:25:58.335158685 -0800
@@ -651,6 +651,7 @@
 AC_CHECK_HEADER(sys/devpoll.h, AC_DEFINE(HAVE_SYS_DEVPOLL_H))
 AC_CHECK_HEADER(linux/kpoll.h, AC_DEFINE(HAVE_LINUX_KPOLL_H))
 AC_CHECK_HEADER(sys/event.h, AC_DEFINE(HAVE_SYS_EVENT_H)) 
+AC_CHECK_HEADER(sys/epoll.h, AC_DEFINE(HAVE_LINUX_EPOLL_H))
 
 LM_DECL_SO_BSDCOMPAT
 LM_DECL_INADDR_LOOPBACK
Only in otp_src_R10B-8/erts: configure.in.orig
Only in otp_src_R10B-8/erts: configure.orig
diff -ru ORIG_otp_src_R10B-8/erts/emulator/sys/unix/sys.c otp_src_R10B-8/erts/emulator/sys/unix/sys.c
--- ORIG_otp_src_R10B-8/erts/emulator/sys/unix/sys.c	2005-08-29 06:13:36.000000000 -0700
+++ otp_src_R10B-8/erts/emulator/sys/unix/sys.c	2005-11-09 14:25:58.337158642 -0800
@@ -13,6 +13,12 @@
  * Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings
  * AB. All Rights Reserved.''
  * 
+ * 2005-08-31
+ * This has been modified by Matthew Reilly of SIPphone Inc. to 
+ * enable kernel poll (+K true) support via the epoll mechanism in Linux 2.6
+ * Portions created by SIPphone Inc. are Copyright 2005, SIPphone Inc.
+ * These modifications are released under the Erlang Public License.
+ *
  *     $Id$
  */
 
@@ -50,6 +56,9 @@
 #      define USE_DEVPOLL
 #      include <sys/devpoll.h>
 #    endif
+#    ifdef HAVE_LINUX_EPOLL_H /* To minimize code changes, we pretend we have HAVE_LINUX_KPOLL_H as well */
+#      define HAVE_LINUX_KPOLL_H 1
+#    endif
 #    ifdef HAVE_LINUX_KPOLL_H
 #      define USE_DEVPOLL
 #      include <asm/page.h>
@@ -58,7 +67,11 @@
 #      ifndef POLLREMOVE
 #        define POLLREMOVE 0x1000 /* some day it will make it to bits/poll.h ;-) */
 #      endif
-#      include <linux/kpoll.h>
+#      ifdef HAVE_LINUX_EPOLL_H
+#        include <sys/epoll.h>
+#      else
+#        include <linux/kpoll.h>
+#      endif
 #    endif
 #    ifdef USE_DEVPOLL /* can only use one of them ... */
 #      ifdef USE_KQUEUE
@@ -201,8 +214,17 @@
 
 static int             dev_poll_fd;   /* fd for /dev/poll */
 #ifdef HAVE_LINUX_KPOLL_H
+
+#ifdef HAVE_LINUX_EPOLL_H
+static struct epoll_event* dev_epoll_map; /* event buffer passed to epoll_wait() */
+/* XXX Implement correct mapping from POLLIN/POLLOUT to/from EPOLLIN/EPOLLOUT */
+/* Currently POLLIN/POLLOUT == EPOLLIN/EPOLLOUT, so these macros will work */
+#define EPOLL_TO_POLL(bit_map) (bit_map)
+#define POLL_TO_EPOLL(bit_map) ((bit_map) & (EPOLLIN|EPOLLOUT))
+#else
 static char *          dev_poll_map;  /* mmap'ed area from kernel /dev/kpoll */
 static struct k_poll   dev_poll;      /* control block for /dev/kpoll */
+#endif /* HAVE_LINUX_EPOLL_H */
 static int max_poll_idx;              /* highest non /dev/kpoll fd */
 
 static void kpoll_enable();
@@ -212,7 +234,7 @@
 static struct pollfd*  dev_poll_rfds = NULL; /* Allocated at startup */
 
 static void devpoll_init(void);
-static void devpoll_update_pix(int pix);
+static void devpoll_update_pix(int pix, int old_events);
 #ifdef HAVE_SYS_DEVPOLL_H
 static void devpoll_clear_pix(int pix);
 #endif /* HAVE_SYS_DEVPOLL_H */
@@ -2021,7 +2043,7 @@
 
 #ifdef USE_DEVPOLL
 	    if (poll_fds[pix].events != old_events) 
-                devpoll_update_pix(pix);
+                devpoll_update_pix(pix, old_events);
 #endif
 #ifdef USE_KQUEUE
 	    if (poll_fds[pix].events != old_events) 
@@ -2077,7 +2099,7 @@
 	    if ( old_events && (dev_poll_fd != -1) ) {
 	       /* Tell /dev/[k]poll that we are not interested any more ... */
 	       poll_fds[pix].events = POLLREMOVE;
-	       devpoll_update_pix(pix);
+	       devpoll_update_pix(pix, old_events);
 	       /* devpoll_update_pix may change the pix */
 	       pix = fd_data[fd].pix;
 	       poll_fds[pix].events = 0;
@@ -2134,7 +2156,7 @@
 #ifdef HAVE_SYS_DEVPOLL_H
 	    devpoll_clear_pix(pix);
 #endif /* HAVE_SYS_DEVPOLL_H */
-	    devpoll_update_pix(pix);
+	    devpoll_update_pix(pix, old_events);
 	}
 #endif
 #ifdef USE_KQUEUE
@@ -2692,6 +2714,27 @@
 	nof_ready_fds = vr;
 
 #if HAVE_LINUX_KPOLL_H
+#ifdef HAVE_LINUX_EPOLL_H
+	if ( do_event_poll ) {
+           if ((r = epoll_wait(dev_poll_fd,dev_epoll_map,max_fd_plus_one,0)) > 0) {
+	    for (i = 0; (i < r); i++) {
+	      short revents = dev_epoll_map[i].events;
+
+	      if (revents != 0) {
+	        int fd = dev_epoll_map[i].data.fd;
+		rp->pfd.fd = fd;
+		rp->pfd.events = poll_fds[fd_data[fd].pix].events;
+		rp->pfd.revents = EPOLL_TO_POLL(revents);
+		rp->iport = fd_data[fd].inport;
+		rp->oport = fd_data[fd].outport;
+		rp++;
+	        nof_ready_fds ++;
+	      } 
+	    }
+           }
+        }
+
+#else
 	if ( do_event_poll ) {
 	  /* Now do the fast poll */
 	  dev_poll.kp_timeout = 0;
@@ -2714,6 +2757,7 @@
 	    nof_ready_fds += r;
 	  }
 	}
+#endif /*HAVE_LINUX_EPOLL_H */
 #endif
 
       } else {
@@ -3622,6 +3666,20 @@
     poll_fds[pix].revents = 0;
 }
 
+#ifdef HAVE_LINUX_EPOLL_H
+static void epoll_init() /* Create the epoll fd and its event buffer; dev_poll_fd is -1 on failure */
+{
+    /* max_files is just a hint to the kernel */
+    if ( (dev_poll_fd=epoll_create(max_files)) < 0 ) {
+        DEBUGF(("Will use poll()\n"));
+        dev_poll_fd = -1; /* We will not use epoll */
+    } else {
+        DEBUGF(("Will use epoll\n"));
+        dev_epoll_map = (struct epoll_event *) erts_alloc(ERTS_ALC_T_POLL_FDS, (sizeof(struct epoll_event) * max_files));
+	erts_sys_misc_mem_sz += sizeof(struct epoll_event) * max_files; /* account for the event buffer */
+    }
+}
+#else
 static void kpoll_init()
 {
     if ( (dev_poll_fd=open("/dev/kpoll",O_RDWR)) < 0 ) {
@@ -3643,6 +3701,7 @@
       dev_poll_rfds =  NULL;
     }
 }
+#endif /* HAVE_LINUX_EPOLL_H */
 
 #endif /* HAVE_LINUX_KPOLL_H */
 
@@ -3672,7 +3731,11 @@
     } else {
         /* Determine use of poll vs. /dev/poll at runtime */
 #ifdef HAVE_LINUX_KPOLL_H
+#ifdef HAVE_LINUX_EPOLL_H
+        epoll_init();
+#else
         kpoll_init();
+#endif
 #else
 #ifdef HAVE_SYS_DEVPOLL_H
         solaris_devpoll_init();
@@ -3698,7 +3761,7 @@
     return count;
 }
 
-static void devpoll_update_pix(int pix)
+static void devpoll_update_pix(int pix, int old_events)
 {
     int res;
 
@@ -3713,10 +3776,33 @@
 
 #endif
     if ( dev_poll_fd != -1 ) {
+#ifdef HAVE_LINUX_EPOLL_H
+       int events = poll_fds[pix].events;
+       int fd = poll_fds[pix].fd;
+       if (old_events && events & POLLREMOVE) {
+            /* Delete file descriptor from epoll list */
+            res = epoll_ctl(dev_poll_fd,EPOLL_CTL_DEL,fd,NULL);
+            /* XXX check return code */
+       } else {
+            struct epoll_event epoll_ctl_event;
+            epoll_ctl_event.data.fd = fd;
+            epoll_ctl_event.events = POLL_TO_EPOLL(events);
+            if (old_events) {
+                /* Modify existing fd */
+                res = epoll_ctl(dev_poll_fd,EPOLL_CTL_MOD,fd,&epoll_ctl_event);
+                /* XXX check return code */
+            } else {
+                /* Add fd to epoll list */
+                res = epoll_ctl(dev_poll_fd,EPOLL_CTL_ADD,fd,&epoll_ctl_event);
+                /* XXX check return code */
+            } 
+       }
+#else
         if ( (res=devpoll_write(dev_poll_fd,&poll_fds[pix],sizeof(struct pollfd))) != 
              (sizeof(struct pollfd)) ) {
             erl_exit(1,"Can't write to /dev/poll\n");
         }
+#endif /* HAVE_LINUX_EPOLL_H */
     }
 #if HAVE_LINUX_KPOLL_H
     } else {


More information about the erlang-patches mailing list