Sendfile in erlang

Miguel Barreiro enano@REDACTED
Fri Nov 14 12:27:08 CET 2003


Hi,

I'm not sure whether the proper list is e-questions or e-patches, but this
is a small patch anyway.

This draft patch adds a sendfile() interface to Erlang. Sendfile(2) is a
system call present in Linux 2.2, AIX 5 and later kernels, and similar
interfaces are present in recent versions of Solaris and possibly other
Unices. The usual loop of read()ing from a file and then send()ing to a
socket or write()ing to another file has an unnecessarily large overhead:
copying data from kernel space to user space on read(), and then back to
kernel space again on write() or send(). Besides, if we are reading from
Erlang, that means getting all those data chunks into the Erlang runtime
memory management system only to get them out again immediately and then
GC them sometime in the future. Very often (think of a web or file server)
our program has no use for that read data except sending it out again.

Sendfile(f,t,o,c) simply instructs the kernel (the OS kernel, not
$ROOTDIR/lib/kernel) to read c bytes at offset o of file descriptor f and
write them again to file descriptor t. No data is moved to/from user
space.


ObPerfData: a cycle of file:read() and gen_tcp:send() moving 4KB chunks
over 1000Base-T between 1GHz Pentium3 machines sustains a throughput of
about 55Mbps. A cycle of file:sendfile() calls sustains over 410Mbps down
the pipe. Make sure you have a well supported network card before trying.

The patch is for testing purposes - I'd be glad to hear comments. I have
kept the kernel sendfile semantics: it may write fewer bytes than
requested, just like the send(2) syscall; the return value is
{ok, SentBytes} or {error, Reason}. Maybe it would be more polite to behave
like gen_tcp:send instead and make sure all data is sent, or else return an
error. More ugly details: it needs the socket descriptor *number*, so for
now you have to call the undocumented function getfd in prim_inet. An
example:

    {ok,From}=file:open(Filename,[read,raw]),
    {ok,Sock}=gen_tcp:connect(Host,Port,[binary,{packet,0}]),
    {ok,SockFD}=prim_inet:getfd(Sock),
    {ok,Sent}=file:sendfile(From,SockFD,Pos,Block),


No guarantees, backup first, parachute not included, etc.

Regards,

Miguel
-------------- next part --------------
diff -ur ../orig/otp_src_R9B-1/erts/emulator/drivers/common/efile_drv.c ./erts/emulator/drivers/common/efile_drv.c
--- ../orig/otp_src_R9B-1/erts/emulator/drivers/common/efile_drv.c	Wed Oct  9 16:22:22 2002
+++ ./erts/emulator/drivers/common/efile_drv.c	Fri Nov 14 11:42:26 2003
@@ -50,6 +50,7 @@
 #define FILE_PREADV		25
 #define FILE_SETOPT		26
 #define FILE_IPREAD             27
+#define FILE_SENDFILE           28
 
 /* Return codes */
 
@@ -317,6 +318,11 @@
 	    size_t        offset;
 	    char          name[1];
 	} read_file;
+        struct {
+	  Sint          destfd;
+	  off_t         offset;
+	  size_t        size;
+	} sendfile;
     } c;
     char b[1];
 };
@@ -799,6 +805,32 @@
     invoke_name(data, efile_chdir);
 }
 
+
+static void invoke_sendfile(void *data)
+{
+  struct t_data *d = (struct t_data *) data;
+  int fd = (int) d->fd;
+  int destfd = (int) d->c.sendfile.destfd;
+  off_t offset = (off_t) d->c.sendfile.offset;
+
+
+  if ((d->result_ok = efile_sendfile(&d->errInfo, fd, destfd, offset,
+				     &(d->c.sendfile.size))))
+      d->again=0;
+  else {
+    switch(d->errInfo.posix_errno){
+  case 0: /*ok*/
+  case EBADF:
+  case EINVAL:
+  case EIO:
+    d->again = 0;
+    break;
+  default:
+    d->again = 1;
+    };
+  }
+}
+
 static void invoke_fsync(void *data)
 {
     struct t_data *d = (struct t_data *) data;
@@ -1563,6 +1595,12 @@
 	  }
 	  free_data(data);
 	  break;
+#ifdef __linux__
+    case FILE_SENDFILE:
+      reply_Uint(desc, d->c.sendfile.size);
+      free_data(data);
+    break;
+#endif
       case FILE_MKDIR:
       case FILE_RMDIR:
       case FILE_CHDIR:
@@ -1713,6 +1751,22 @@
     command = *(uchar*)buf++;
 
     switch(command) {
+
+    case FILE_SENDFILE:
+    {
+      d = EF_SAFE_ALLOC(sizeof(struct t_data) -1 +20);
+
+      d->fd = fd;
+      d->c.sendfile.destfd = get_int32(buf);
+      d->c.sendfile.offset = get_int32(buf + 8); /* TODO: get_64? */
+      d->c.sendfile.size = get_int32(buf + 16);  /* idem */
+      d->command = command;
+      d->invoke = invoke_sendfile;
+      d->free = free_data;
+      d->level = 2; /*?*/
+      goto done;
+    }
+      
 
     case FILE_MKDIR:
     {
diff -ur ../orig/otp_src_R9B-1/erts/emulator/drivers/common/erl_efile.h ./erts/emulator/drivers/common/erl_efile.h
--- ../orig/otp_src_R9B-1/erts/emulator/drivers/common/erl_efile.h	Wed Oct  2 23:20:42 2002
+++ ./erts/emulator/drivers/common/erl_efile.h	Mon Nov 10 04:35:56 2003
@@ -148,8 +148,10 @@
 		   char* buffer, size_t size);
 int efile_link(Efile_error* errInfo, char* old, char* new);
 int efile_symlink(Efile_error* errInfo, char* old, char* new);
-
-
+#ifdef __linux__
+int efile_sendfile(Efile_error* errInfo, int fd, int outfd, off_t offset,
+		   size_t* count);
+#endif
 
 
 
diff -ur ../orig/otp_src_R9B-1/erts/emulator/drivers/unix/unix_efile.c ./erts/emulator/drivers/unix/unix_efile.c
--- ../orig/otp_src_R9B-1/erts/emulator/drivers/unix/unix_efile.c	Wed Oct  2 23:20:57 2002
+++ ./erts/emulator/drivers/unix/unix_efile.c	Fri Nov 14 11:43:42 2003
@@ -72,6 +72,10 @@
 #  endif
 #endif /* !VXWORKS */
 
+#ifdef __linux__
+#include <sys/sendfile.h>
+#endif
+
 #ifdef SUNOS4
 #  define getcwd(buf, size) getwd(buf)
 #endif
@@ -775,6 +779,24 @@
 {
     close(fd);
 }
+
+
+#ifdef __linux__
+int
+efile_sendfile(Efile_error* errInfo,
+	       int fd,
+	       int outfd,
+	       off_t offset,
+	       size_t* countptr
+	       )
+{
+  int r;
+  r=sendfile(outfd, fd, &offset, *countptr);
+  if(r>=0) *countptr=r;
+  return check_error(r,errInfo);
+}
+
+#endif
 
 int
 efile_fsync(Efile_error *errInfo, /* Where to return error codes. */
Binary files ../orig/otp_src_R9B-1/lib/kernel/ebin/prim_file.beam and ./lib/kernel/ebin/prim_file.beam differ
diff -ur ../orig/otp_src_R9B-1/lib/kernel/src/file.erl ./lib/kernel/src/file.erl
--- ../orig/otp_src_R9B-1/lib/kernel/src/file.erl	Wed Oct  2 23:15:30 2002
+++ ./lib/kernel/src/file.erl	Fri Nov 14 11:34:48 2003
@@ -37,7 +37,7 @@
 -export([open/2, close/1, 
 	 read/2, write/2, 
 	 pread/2, pread/3, pwrite/2, pwrite/3,
-	 position/2, truncate/1, sync/1,
+	 position/2, truncate/1, sendfile/4, sync/1,
 	 copy/2, copy/3]).
 %% High level operations
 -export([consult/1, path_consult/2, eval/1, path_eval/2, path_open/3]).
@@ -562,6 +562,12 @@
 pwrite(_, _, _) ->
     {error, einval}.
 
+
+%enano@REDACTED
+sendfile(#file_descriptor{module = Module} = Handle, DestFD, Offset, Bytes) ->
+    Module:sendfile(Handle, DestFD, Offset, Bytes);
+sendfile(Other,_,_,_) ->
+    {error, einval}.
 
 
 sync(File) when pid(File) ->
diff -ur ../orig/otp_src_R9B-1/lib/kernel/src/prim_file.erl ./lib/kernel/src/prim_file.erl
--- ../orig/otp_src_R9B-1/lib/kernel/src/prim_file.erl	Wed Oct  2 23:30:06 2002
+++ ./lib/kernel/src/prim_file.erl	Fri Nov 14 11:35:39 2003
@@ -24,7 +24,7 @@
 %%% Interface towards a single file's contents. Uses ?FD_DRV.
 
 %% Generic file contents operations
--export([open/2, close/1, sync/1, position/2, truncate/1,
+-export([open/2, close/1, sendfile/4, sync/1, position/2, truncate/1,
 	 write/2, pwrite/2, pwrite/3, read/2, pread/2, pread/3, copy/3]).
 
 %% Specialized file operations
@@ -92,6 +92,7 @@
 -define(FILE_PREADV,           25).
 -define(FILE_SETOPT,           26).
 -define(FILE_IPREAD,           27).
+-define(FILE_SENDFILE,         28).
 
 %% Driver responses
 -define(FILE_RESP_OK,          0).
@@ -284,7 +285,10 @@
 pwrite(#file_descriptor{module = ?MODULE}, _, _) ->
     {error, einval}.
 
-
+%% Returns {ok, SentBytes} | {error, Reason}.
+sendfile(Filedes, DestFD,  Offset, Bytes) ->
+    #file_descriptor{module = ?MODULE, data = {Port, _}}=Filedes,
+    drv_command(Port, <<?FILE_SENDFILE, DestFD:32, Offset:64, Bytes:64>>).
 
 %% Returns {error, Reason} | ok.
 sync(#file_descriptor{module = ?MODULE, data = {Port, _}}) ->


More information about the erlang-patches mailing list