[erlang-questions] Fast directory walker

Sergej Jurečko sergej.jurecko@REDACTED
Sat Dec 10 11:57:28 CET 2016


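A faster version that returns the total size of a folder. Building the path with Pth++"/"++H is faster than filename:join, and calling prim_file directly is also faster than going through the file module. (The #file_info{} record used below requires -include_lib("kernel/include/file.hrl").)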

%% walker1(Path) -> integer()
%%
%% Returns the summed size, in bytes, of the regular files found under the
%% directory Path, recursing into subdirectories. Returns 0 when Path cannot
%% be listed (prim_file:list_dir/1 returned anything but {ok, Names}).
%%
%% Needs -include_lib("kernel/include/file.hrl") for the #file_info{} record.
walker1(Path) ->
	case prim_file:list_dir(Path) of
		{ok,L} ->
			walker1(Path,L,0);
		_ ->
			0
	end.

%% walker1(Dir, Names, Acc) -> integer()
%%
%% Adds the size of every entry in Names (base names relative to Dir) to Acc
%% and returns the total. "." and ".." are skipped so the walk can never step
%% back into the directory itself or its parent.
walker1(Pth,["."|T],Sz) ->
	walker1(Pth,T,Sz);
walker1(Pth,[".."|T],Sz) ->
	walker1(Pth,T,Sz);
walker1(Pth,[H|T],Sz) ->
	%% Plain concatenation on purpose: cheaper than filename:join/2.
	Nm = Pth++"/"++H,
	walker1(Pth,T,Sz+walker1_entry_size(Nm));
walker1(_,[],Sz) ->
	Sz.

%% Size contributed by a single path: the file size for a regular file, the
%% recursive total for a real directory, the target size for a symbolic link
%% to a regular file, and 0 for anything else (including unreadable entries).
walker1_entry_size(Nm) ->
	%% read_link_info does not follow symbolic links, so a link pointing at
	%% a directory (possibly an ancestor of Nm) is reported as 'symlink'
	%% and is never descended. This keeps link cycles from being walked
	%% over and over and keeps linked trees from being counted twice.
	case prim_file:read_link_info(Nm) of
		{ok,#file_info{type = regular, size = FS}} ->
			FS;
		{ok,#file_info{type = directory}} ->
			walker1(Nm);
		{ok,#file_info{type = symlink}} ->
			walker1_link_size(Nm);
		_ ->
			0
	end.

%% Size of the regular file a symbolic link resolves to; 0 when the link is
%% dangling or resolves to something that is not a regular file.
walker1_link_size(Nm) ->
	case prim_file:read_file_info(Nm) of
		{ok,#file_info{type = regular, size = FS}} ->
			FS;
		_ ->
			0
	end.

> On 10 Dec 2016, at 10:20, Frank Muller <frank.muller.erl@REDACTED> wrote:
> 
> Combining previous hints (Benoit, Sergej):
> 
> -module(directory).
> -include_lib("kernel/include/file.hrl").
> -export([walker/1]).
> 
> walker(Path) ->
>     case file:read_file_info(Path, [raw]) of
>         {ok, #file_info{type = regular}} ->
>             1;
>         _ -> %% not care about symlink for now, assume a directory
>             Children = filelib:wildcard(Path ++ "/*"),
>             lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
> 	end.
> 
> > timer:tc(fun() -> directory:walker("/usr/share") end).
> {1611688, 28953}
> 
> I'm only counting number of files in this case.
> 
> /Frank
> 
> Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <sergej.jurecko@REDACTED <mailto:sergej.jurecko@REDACTED>> a écrit :
> read_file_info does the job of is_dir and file_size in a single call. That was the intention.
> 
> Also use file:read_file_info(name,[raw])
> 
> 
> Sergej
> 
>> On 10 Dec 2016, at 09:42, Benoit Chesneau <bchesneau@REDACTED <mailto:bchesneau@REDACTED>> wrote:
>> 
>> this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
>> https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257 <https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257>
>> 
>> except if you have a better algorithm in mind i don't see the point of rewriting something that is already existing ...
>> 
>> On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <sergej.jurecko@REDACTED <mailto:sergej.jurecko@REDACTED>> wrote:
>> Stop using filelib functions. Use file:read_file_info and file:list_dir.
>> 
>> Sergej
>> 
>> On Dec 10, 2016 9:29 AM, "Frank Muller" <frank.muller.erl@REDACTED <mailto:frank.muller.erl@REDACTED>> wrote:
>> Hi Stanislaw
>> 
>> First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.
>> 
>> And you're right, here is a detailed comparison with other scripting languages:
>> 
>> In my /usr/share, there’s:
>> 2580 directories
>> 28953 files
>> 
>> 1. Erlang (no io:format/1, just recurse):
>> 
>> walk(Dir) ->
>>     {ok, Files} = file:list_dir(Dir),
>>     walk(Dir, Files).
>> 
>> walk(Dir, [ Basename | Rest ]) ->
>>     Path = filename:join([ Dir, Basename ]),
>>     case filelib:is_dir(Path) of
>>         true  ->
>>             walk(Path);
>>         false ->
>>           %%  io:format("~s~n", [Path]),
>>             filelib:file_size(Path)
>>     end,
>>     walk(Dir, Rest);
>> walk(_, []) ->
>>     ok.
>> 
>> timer:tc(fun() -> directoy:walker("/usr/share") end).
>> {4662361,ok}
>> 
>> 2. Python (this code even count the size of dir):
>> From: http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python <http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python>
>> 
>> import os
>> def get_size(start_path = '.'):
>>     total_size = 0
>>     for dirpath, dirnames, filenames in os.walk(start_path):
>>         for f in filenames:
>>             fp = os.path.join(dirpath, f)
>>             total_size += os.path.getsize(fp)
>>     return total_size
>> 
>> print get_size()
>> 
>> $ cd /usr/share
>> $ time dir_walker.py
>> 432034130
>> 0.25 real         0.13 user         0.10 sys
>> 
>> 2. Perl (same, count dir size)
>> http://www.perlmonks.org/?node_id=168974 <http://www.perlmonks.org/?node_id=168974>
>> 
>> use File::Find;           
>> my $size = 0;             
>> find(sub { $size += -s if -f $_ }, "/usr/share");
>> 
>> $ time perl dir_walker.pl
>> 432034130
>> 0.13 real         0.05 user         0.08 sys
>> 
>> 3. Ruby (same, count dir size):
>> 
>> def directory_size(path)
>>   path << '/' unless path.end_with?('/')
>>   raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
>>   total_size = 0
>>   Dir["#{path}**/*"].each do |f|
>>     total_size += File.size(f) if File.file?(f) && File.size?(f)
>>   end
>>   total_size
>> end
>> puts directory_size '/usr/share’
>> 
>> $ time walker.rb
>> 432028422
>> 0.21 real         0.09 user         0.11 sys
>> 
>> 4. Lua:
>> From: http://lua-users.org/wiki/DirTreeIterator <http://lua-users.org/wiki/DirTreeIterator>
>> 
>> require "lfs"
>> 
>> function dirtree(dir)
>>   assert(dir and dir ~= "", "directory parameter is missing or empty")
>>   if string.sub(dir, -1) == "/" then
>>     dir=string.sub(dir, 1, -2)
>>   end
>> 
>>   local function yieldtree(dir)
>>     for entry in lfs.dir(dir) do
>>       if entry ~= "." and entry ~= ".." then
>>         entry=dir.."/"..entry
>> 	local attr=lfs.attributes(entry)
>> 	coroutine.yield(entry,attr)
>> 	if attr.mode == "directory" then
>> 	  yieldtree(entry)
>> 	end
>>       end
>>     end
>>   end
>> 
>>   return coroutine.wrap(function() yieldtree(dir) end)
>> end
>> 
>> for filename, attr in dirtree("/usr/share") do
>>       print(attr.mode, filename)
>> end
>> 
>> $ luarocks install luafilesystem
>> $ time lua walker.lua > /dev/null
>> 0.30 real         0.16 user         0.14 sys
>> 
>> Do you need more?
>> 
>> Thanks for your help.
>> /Frank
>> 
>> Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <erlang.org@REDACTED <mailto:erlang.org@REDACTED>> a écrit :
>> On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:
>> 
>> > I would like to improve the speed of my directory walker.
>> 
>> >
>> 
>> > walk(Dir) ->
>> 
>> >     {ok, Files} = prim_file:list_dir(Dir),
>> 
>> >     walk(Dir, Files).
>> 
>> 
>> 
>> Why prim_file:list_dir() instead of file:list_dir()? The former is
>> 
>> undocumented internal function.
>> 
>> 
>> 
>> [...]
>> 
>> > Compared to almost anything i found on the web, it’s still very slow:
>> 
>> > > timer:tc(fun() -> dir:walk("/usr/share") end).
>> 
>> > {4662361,ok}
>> 
>> 
>> 
>> What is it this "anything you found on the web"? And how did you run
>> 
>> your comparisons? There's a large difference between first and second
>> 
>> consequent run caused by OS' directory cache, and there's large
>> 
>> difference between simply walking through the directory and walking with
>> 
>> printing something to the screen for every file.
>> 
>> 
>> 
>> Then there's also your using filelib:is_dir() and then
>> 
>> filelib:file_size(), which means two stat(2) calls, while you only need
>> 
>> to do it once per file (file:read_file_info()).
>> 
>> 
>> 
>> --
>> 
>> Stanislaw Klekot
>> 
>> 
>> 
>> 
>> _______________________________________________
>> 
>> 
>> erlang-questions mailing list
>> 
>> 
>> erlang-questions@REDACTED <mailto:erlang-questions@REDACTED>
>> 
>> 
>> http://erlang.org/mailman/listinfo/erlang-questions <http://erlang.org/mailman/listinfo/erlang-questions>
>> 
>> 
>> 
>> 
>> 
>> _______________________________________________
>> 
>> 
>> erlang-questions mailing list
>> 
>> 
>> erlang-questions@REDACTED <mailto:erlang-questions@REDACTED>
>> 
>> 
>> http://erlang.org/mailman/listinfo/erlang-questions <http://erlang.org/mailman/listinfo/erlang-questions>
>> 
>> 
>> 
>> 
> 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://erlang.org/pipermail/erlang-questions/attachments/20161210/bcc52206/attachment.htm>


More information about the erlang-questions mailing list