[erlang-questions] Fast directory walker

Sat Dec 10 11:33:33 CET 2016

Out of interest, what do you get when you run the same benchmark twice in succession?

The buffer cache would play a role on the second and subsequent attempts. For your code below, running on my oldish laptop, I get {35667060,158949} and {8606920,158949}.

You can use "blockdev --flushbufs <device>" to ensure the buffers are clean before running the benchmarks, especially when comparing different versions and languages.

Having said that, the erlang version does look slow, especially when compared with the shell equivalent "time find /usr/share -print | wc -l"

$ sudo blockdev --flushbufs /dev/...

$ time find /usr/share -print | wc -l
186911

real	0m32.446s
user	0m0.796s
sys	0m1.808s

$ time find /usr/share -print | wc -l
186911

real	0m0.336s
user	0m0.152s
sys	0m0.200s

Perhaps there is room for improvement within the library itself!

Cheers,
f.

On 10/12/16 09:20, Frank Muller wrote:
> Combining previous hints (Benoit, Sergej):
>
> -module(directory).
> -include_lib("kernel/include/file.hrl").
> -export([walker/1]).
>
> walker(Path) ->
>     case file:read_file_info(Path, [raw]) of
>         {ok, #file_info{type = regular}} ->
>             1;
>         _ -> %% not care about symlink for now, assume a directory
>             Children = filelib:wildcard(Path ++ "/*"),
>             lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
> end.
>
>> timer:tc(fun() -> directory:walker("/usr/share") end).
> {1611688, <tel:1611688,28953>/28953 <tel:1611688,28953>/}
>
> I'm only counting number of files in this case.
>
> /Frank
>
> Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <sergej.jurecko@REDACTED <mailto:sergej.jurecko@REDACTED>> a écrit :
>
>     read_file_info does the job of is_dir and file_size in a single call. That was the intention.
>
>     Also use file:read_file_info(name,[raw])
>
>
>     Sergej
>
>>     On 10 Dec 2016, at 09:42, Benoit Chesneau <bchesneau@REDACTED <mailto:bchesneau@REDACTED>> wrote:
>>
>>     this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
>>     https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257
>>
>>     except if you have a better algorithm in mind i don't se the point of rewriting something that is aleaready existing ...
>>
>>     On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <sergej.jurecko@REDACTED <mailto:sergej.jurecko@REDACTED>> wrote:
>>
>>         Stop using filelib functions. Use file:read_file_info and file:list_dir.
>>
>>         Sergej
>>
>>         On Dec 10, 2016 9:29 AM, "Frank Muller" <frank.muller.erl@REDACTED <mailto:frank.muller.erl@REDACTED>> wrote:
>>
>>             Hi Stanislaw
>>
>>             First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.
>>
>>             And you're right, here is a detailed comparison with other scripting languages:
>>
>>             In my /usr/share, there’s:
>>             2580 directories
>>             28953 files
>>
>>             1. Erlang (no io:format/1, just recurse):
>>
>>             walk(Dir) ->
>>                 {ok, Files} = file:list_dir(Dir),
>>                 walk(Dir, Files).
>>
>>             walk(Dir, [ Basename | Rest ]) ->
>>                 Path = filename:join([ Dir, Basename ]),
>>                 case filelib:is_dir(Path) of
>>                     true  ->
>>                         walk(Path);
>>                     false ->
>>                       %%  io:format("~s~n", [Path]),
>>                         filelib:file_size(Path)
>>                 end,
>>                 walk(Dir, Rest);
>>             walk(_, []) ->
>>                 ok.
>>
>>             timer:tc(fun() -> directoy:walker("/usr/share") end).
>>             {4662361 <tel:4662361>,ok}
>>
>>             2. Python (this code even count the size of dir):
>>             From: http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python
>>
>>             import os
>>             def get_size(start_path = '.'):
>>                 total_size = 0
>>                 for dirpath, dirnames, filenames in os.walk(start_path):
>>                     for f in filenames:
>>                         fp = os.path.join(dirpath, f)
>>                         total_size += os.path.getsize(fp)
>>                 return total_size
>>
>>             print get_size()
>>
>>             $ cd /usr/share
>>             $ time dir_walker.py
>>             432034130 <tel:432034130>
>>             0.25 real         0.13 user         0.10 sys
>>
>>             2. Perl (same, count dir size)
>>             http://www.perlmonks.org/?node_id=168974
>>
>>             use File::Find;
>>             my $size = 0;
>>             find(sub { $size += -s if -f $_ }, "/usr/share");
>>
>>             $ time perl dir_walker.pl <http://dir_walker.pl/>
>>             432034130 <tel:432034130>
>>             0.13 real         0.05 user         0.08 sys
>>
>>             3. Ruby (same, count dir size):
>>
>>             def directory_size(path)
>>               path << '/' unless path.end_with?('/')
>>               raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
>>               total_size = 0
>>               Dir["#{path}**/*"].each do |f|
>>                 total_size += File.size(f) if File.file?(f) && File.size?(f)
>>               end
>>               total_size
>>             end
>>             puts directory_size '/usr/share’
>>
>>             $ time walker.rb
>>             432028422 <tel:432028422>
>>             0.21 real         0.09 user         0.11 sys
>>
>>             4. Lua:
>>             From: http://lua-users.org/wiki/DirTreeIterator
>>
>>             require "lfs"
>>
>>             function dirtree(dir)
>>               assert(dir and dir ~= "", "directory parameter is missing or empty")
>>               if string.sub(dir, -1) == "/" then
>>                 dir=string.sub(dir, 1, -2)
>>               end
>>
>>               local function yieldtree(dir)
>>                 for entry in lfs.dir(dir) do
>>                   if entry ~= "." and entry ~= ".." then
>>                     entry=dir.."/"..entry
>>             local attr=lfs.attributes(entry)
>>             coroutine.yield(entry,attr)
>>             if attr.mode == "directory" then
>>               yieldtree(entry)
>>             end
>>                   end
>>                 end
>>               end
>>
>>               return coroutine.wrap(function() yieldtree(dir) end)
>>             end
>>
>>             for filename, attr in dirtree("/usr/share") do
>>                   print(attr.mode, filename)
>>             end
>>
>>             $ luarocks install luafilesystem
>>             $ time lua walker.lua > /dev/null
>>             0.30 real         0.16 user         0.14 sys
>>
>>             Do you need more?
>>
>>             Thanks for you help.
>>             /Frank
>>
>>             Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <erlang.org@REDACTED <mailto:erlang.org@REDACTED>> a écrit :
>>
>>                 On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:
>>
>>                 > I would like to improve the speed of my directory walker.
>>
>>                 >
>>
>>                 > walk(Dir) ->
>>
>>                 >     {ok, Files} = prim_file:list_dir(Dir),
>>
>>                 >     walk(Dir, Files).
>>
>>
>>
>>                 Why prim_file:list_dir() instead of file:list_dir()? The former is
>>
>>                 undocumented internal function.
>>
>>
>>
>>                 [...]
>>
>>                 > Compared to almost anything i found on the web, it’s still very slow:
>>
>>                 > > timer:tc(fun() -> dir:walk("/usr/share") end).
>>
>>                 > {4662361,ok}
>>
>>
>>
>>                 What is it this "anything you found on the web"? And how did you run
>>
>>                 your comparisons? There's a large difference between first and second
>>
>>                 consequent run caused by OS' directory cache, and there's large
>>
>>                 difference between simply walking through the directory and walking with
>>
>>                 printing something to the screen for every file.
>>
>>
>>
>>                 Then there's also your using filelib:is_dir() and then
>>
>>                 filelib:file_size(), which means two stat(2) calls, while you only need
>>
>>                 to do it once per file (file:read_file_info()).
>>
>>
>>
>>                 --
>>
>>                 Stanislaw Klekot
>>
>>