[erlang-questions] Speeding up text file I/O
Dmitrii 'Mamut' Dimandt
dmitriid@REDACTED
Mon Jan 7 15:20:19 CET 2008
Florian Weimer wrote:
> Is there a way to read lines from a text files more quickly than the
> excerpt below? This runs over 100 times slower than a "wc -l":
>
>
As a matter of fact, there is. It is possible to do up to 20 MB/s using
Erlang only.
Here's the original post: http://gaperton.livejournal.com/11736.html
It's in Russian, so here's a quick recap:
You can use the raw option to get access to the file driver directly.
And if you use binaries, nothing gets copied/freed/copied/freed etc.
So, we are going to use a file in raw mode using block of given length.
First we'll write a function that'll search for a symbol in a binary
real fast. Here it is:
find_8( Buffer, Char ) -> find_8( Buffer, Char, 0 ).
find_8( Buffer, Char, Pos ) ->
case Buffer of
<< _:Pos/bytes, Char:8, _/bytes >> -> Pos;
<< _:Pos/bytes, _:1/bytes, Char:8, _/bytes >> -> Pos + 1;
<< _:Pos/bytes, _:2/bytes, Char:8, _/bytes >> -> Pos + 2;
<< _:Pos/bytes, _:3/bytes, Char:8, _/bytes >> -> Pos + 3;
<< _:Pos/bytes, _:4/bytes, Char:8, _/bytes >> -> Pos + 4;
<< _:Pos/bytes, _:5/bytes, Char:8, _/bytes >> -> Pos + 5;
<< _:Pos/bytes, _:6/bytes, Char:8, _/bytes >> -> Pos + 6;
<< _:Pos/bytes, _:7/bytes, Char:8, _/bytes >> -> Pos + 7;
<< _:Pos/bytes, _:8/bytes, Char:8, _/bytes >> -> Pos + 8;
<< _:Pos/bytes, _:9/bytes, Char:8, _/bytes >> -> Pos + 9;
<< _:Pos/bytes, _:10/bytes, Char:8, _/bytes >> -> Pos + 10;
<< _:Pos/bytes, _:11/bytes, Char:8, _/bytes >> -> Pos + 11;
<< _:Pos/bytes, _:12/bytes, Char:8, _/bytes >> -> Pos + 12;
<< _:Pos/bytes, _:13/bytes, Char:8, _/bytes >> -> Pos + 13;
<< _:Pos/bytes, _:14/bytes, Char:8, _/bytes >> -> Pos + 14;
<< _:Pos/bytes, _:15/bytes, Char:8, _/bytes >> -> Pos + 15;
<< _:Pos/bytes, _:16/bytes, Char:8, _/bytes >> -> Pos + 16;
<< _:Pos/bytes, _:17/bytes, Char:8, _/bytes >> -> Pos + 17;
<< _:Pos/bytes, _:18/bytes, Char:8, _/bytes >> -> Pos + 18;
<< _:Pos/bytes, _:19/bytes, Char:8, _/bytes >> -> Pos + 19;
<< _:Pos/bytes, _:20/bytes, Char:8, _/bytes >> -> Pos + 20;
<< _:Pos/bytes, _:21/bytes, Char:8, _/bytes >> -> Pos + 21;
<< _:Pos/bytes, _:22/bytes, Char:8, _/bytes >> -> Pos + 22;
<< _:Pos/bytes, _:23/bytes, Char:8, _/bytes >> -> Pos + 23;
<< _:Pos/bytes, _:24/bytes, Char:8, _/bytes >> -> Pos + 24;
<< _:Pos/bytes, _:25/bytes, Char:8, _/bytes >> -> Pos + 25;
<< _:Pos/bytes, _:26/bytes, Char:8, _/bytes >> -> Pos + 26;
<< _:Pos/bytes, _:27/bytes, Char:8, _/bytes >> -> Pos + 27;
<< _:Pos/bytes, _:28/bytes, Char:8, _/bytes >> -> Pos + 28;
<< _:Pos/bytes, _:29/bytes, Char:8, _/bytes >> -> Pos + 29;
<< _:Pos/bytes, _:30/bytes, Char:8, _/bytes >> -> Pos + 30;
<< _:Pos/bytes, _:31/bytes, Char:8, _/bytes >> -> Pos + 31;
<< _:Pos/bytes, _:32/bytes, _/bytes >> -> find_8( Buffer, Char,
Pos + 32 );
_ -> not_found
end.
Scary, huh? I've unwound the loop so that the compiler could generate a
long piece of native code, with optimal arranging of the pattern
matches. This wotks well with only 8 lines, but for me an extra 10% of
performance is a big win.
Let us now create a function that splits a binary on a given character:
%% split_char( binary(), byte() ) -> { binary(), binary() } | not_found
split_char( Buffer, Char ) ->
case find_8( Buffer, Char, 0 ) of
not_found -> not_found;
Pos ->
<< Before:Pos/bytes, _:8, After/bytes >> = Buffer,
{ Before, After }
end.
we're almost ready to go on with implementing a get_lines of our own.
It's getting really interesting. Hardcore Erlang:
%% file_reader( File, Len ) -> Handle
%% Handle = { NextF, binary() } | eof
%% NextF = fun() -> Handle
file_reader( File, Len ) -> file_reader( File, Len, << >> ).
file_reader( File, LenI, BufferB ) ->
NextF = fun() ->
case file:read( File, LenI ) of
{ ok, DataB } -> file_reader( File, LenI, DataB );
eof -> eof
end
end,
{ NextF, BufferB }.
This function returns an "iterator". An iterator to the data blocks
(since we are reading our data in blocks). This returns a tuple, whose
first element is the current block and first element is a function that
takes no args but simply returns the next block when called.
Now, here comes our get_line function:
get_line( { NextF, BufferB } ) ->
case split_char( BufferB, 10 ) of
{ LineB, RestB } -> { { NextF, RestB }, LineB };
not_found ->
case NextF() of
eof -> { eof, BufferB };
Handl_1 ->
{ Handl_2, LineB } = get_line( Handl_1 ),
{ Handl_2, << BufferB/bytes, LineB/bytes >> }
end
end.
That's it. The funny thing is that the get_line function we got knows
nothing about files. It knows that it needs to call the function to get
the next block. We can actually plug in any source of data we want.
Let's test the function on a 821-megabyte file on a iMac G5 1,9 GHz:
tf( Name, Len ) ->
{ ok, Fl } = file:open( Name, [ read, raw, binary ] ),
tf_loop( file_reader( Fl, Len ) ),
file:close( Fl ).
%% where
tf_loop( eof ) -> done;
tf_loop( Hnd ) ->
{ Hnd_2, _ } = get_line( Hnd ),
tf_loop( Hnd_2 ).
Buffer size (bytes) 256 1024 4096 8192 16384
default 91,3 53,6 44,1 42,1 41,7
read_ahead 60,6 44,9 42,1 41,2 40,9
async 265,2 102,1 59,2 46,9 44,6
async, read_ahead 59,7 45,7 42,6 41,7 40,3
default means no async threads or read ahead. You should use read ahead
with async threads when doing active i/o. You should also keep the
buffer size above 1 kilobyte. 4 kb should be sufficient for most tasks
More information about the erlang-questions
mailing list