[erlang-questions] Fast directory walker
Sergej Jurečko
sergej.jurecko@REDACTED
Sat Dec 10 11:57:28 CET 2016
A faster version that returns size of folder. Pth++"/"++H is faster than filename:join, and directly calling prim_file is also faster.
%% @doc Returns the accumulated size of the regular files found under the
%% directory `Path', as summed by walker1/3 starting from a total of 0.
%% If `Path' cannot be listed (any result other than `{ok, Names}' from
%% prim_file:list_dir/1), the result is 0.
walker1(Path) ->
    Listing = prim_file:list_dir(Path),
    case Listing of
        {ok, Entries} ->
            %% Start the accumulating walk with an empty running total.
            walker1(Path, Entries, 0);
        _NotListable ->
            0
    end.
%% @doc Walks the directory entries named in the list (children of `Pth'),
%% adding the size of every regular file found, recursively, to the running
%% total `Sz'. Returns the final total.
%%
%% Entries are classified with prim_file:read_link_info/1, which reports a
%% symbolic link as `symlink' instead of following it. Symbolic links are
%% never descended into, so a link pointing back at an ancestor directory
%% cannot cause the same tree to be counted again and again. A link whose
%% target is a regular file still contributes the target's size.
%% Entries that cannot be inspected or listed are skipped.
walker1(Pth,["."|T],Sz) ->
    walker1(Pth,T,Sz);
walker1(Pth,[".."|T],Sz) ->
    walker1(Pth,T,Sz);
walker1(Pth,[H|T],Sz) ->
    %% Plain concatenation instead of filename:join/2: see the note in the
    %% post, it is measurably faster for this hot path.
    Nm = Pth++"/"++H,
    walker1(Pth,T,walker1_entry(Nm,prim_file:read_link_info(Nm),Sz));
walker1(_,[],Sz) ->
    Sz.

%% Adds the contribution of the single entry `Nm' to `Sz', given the result
%% of prim_file:read_link_info(Nm).
walker1_entry(_Nm,{ok,#file_info{type = regular, size = FS}},Sz) ->
    Sz+FS;
walker1_entry(Nm,{ok,#file_info{type = directory}},Sz) ->
    %% A real directory (not a link): descend into it.
    case prim_file:list_dir(Nm) of
        {ok,L} ->
            walker1(Nm,L,Sz);
        _ ->
            Sz
    end;
walker1_entry(Nm,{ok,#file_info{type = symlink}},Sz) ->
    %% Resolve the link only to count a regular-file target; a link to a
    %% directory (or a dangling link) contributes nothing.
    case prim_file:read_file_info(Nm) of
        {ok,#file_info{type = regular, size = FS}} ->
            Sz+FS;
        _ ->
            Sz
    end;
walker1_entry(_Nm,_Other,Sz) ->
    %% Devices, sockets, unreadable entries, ...: ignored.
    Sz.
> On 10 Dec 2016, at 10:20, Frank Muller <frank.muller.erl@REDACTED> wrote:
>
> Combining previous hints (Benoit, Sergej):
>
> -module(directory).
> -include_lib("kernel/include/file.hrl").
> -export([walker/1]).
>
> walker(Path) ->
> case file:read_file_info(Path, [raw]) of
> {ok, #file_info{type = regular}} ->
> 1;
> _ -> %% not care about symlink for now, assume a directory
> Children = filelib:wildcard(Path ++ "/*"),
> lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
> end.
>
> > timer:tc(fun() -> directory:walker("/usr/share") end).
> {1611688, 28953}
>
> I'm only counting number of files in this case.
>
> /Frank
>
> Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <sergej.jurecko@REDACTED <mailto:sergej.jurecko@REDACTED>> a écrit :
> read_file_info does the job of is_dir and file_size in a single call. That was the intention.
>
> Also use file:read_file_info(name,[raw])
>
>
> Sergej
>
>> On 10 Dec 2016, at 09:42, Benoit Chesneau <bchesneau@REDACTED <mailto:bchesneau@REDACTED>> wrote:
>>
>> this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
>> https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257 <https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257>
>>
>> except if you have a better algorithm in mind i don't see the point of rewriting something that is already existing ...
>>
>> On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <sergej.jurecko@REDACTED <mailto:sergej.jurecko@REDACTED>> wrote:
>> Stop using filelib functions. Use file:read_file_info and file:list_dir.
>>
>> Sergej
>>
>> On Dec 10, 2016 9:29 AM, "Frank Muller" <frank.muller.erl@REDACTED <mailto:frank.muller.erl@REDACTED>> wrote:
>> Hi Stanislaw
>>
>> First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.
>>
>> And you're right, here is a detailed comparison with other scripting languages:
>>
>> In my /usr/share, there’s:
>> 2580 directories
>> 28953 files
>>
>> 1. Erlang (no io:format/1, just recurse):
>>
>> walk(Dir) ->
>> {ok, Files} = file:list_dir(Dir),
>> walk(Dir, Files).
>>
>> walk(Dir, [ Basename | Rest ]) ->
>> Path = filename:join([ Dir, Basename ]),
>> case filelib:is_dir(Path) of
>> true ->
>> walk(Path);
>> false ->
>> %% io:format("~s~n", [Path]),
>> filelib:file_size(Path)
>> end,
>> walk(Dir, Rest);
>> walk(_, []) ->
>> ok.
>>
>> timer:tc(fun() -> directoy:walker("/usr/share") end).
>> {4662361,ok}
>>
>> 2. Python (this code even count the size of dir):
>> From: http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python <http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python>
>>
>> import os
>> def get_size(start_path = '.'):
>> total_size = 0
>> for dirpath, dirnames, filenames in os.walk(start_path):
>> for f in filenames:
>> fp = os.path.join(dirpath, f)
>> total_size += os.path.getsize(fp)
>> return total_size
>>
>> print get_size()
>>
>> $ cd /usr/share
>> $ time dir_walker.py
>> 432034130 <tel:432034130>
>> 0.25 real 0.13 user 0.10 sys
>>
>> 2. Perl (same, count dir size)
>> http://www.perlmonks.org/?node_id=168974 <http://www.perlmonks.org/?node_id=168974>
>>
>> use File::Find;
>> my $size = 0;
>> find(sub { $size += -s if -f $_ }, "/usr/share");
>>
>> $ time perl dir_walker.pl <http://dir_walker.pl/>
>> 432034130 <tel:432034130>
>> 0.13 real 0.05 user 0.08 sys
>>
>> 3. Ruby (same, count dir size):
>>
>> def directory_size(path)
>> path << '/' unless path.end_with?('/')
>> raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
>> total_size = 0
>> Dir["#{path}**/*"].each do |f|
>> total_size += File.size(f) if File.file?(f) && File.size?(f)
>> end
>> total_size
>> end
>> puts directory_size '/usr/share'
>>
>> $ time walker.rb
>> 432028422 <tel:432028422>
>> 0.21 real 0.09 user 0.11 sys
>>
>> 4. Lua:
>> From: http://lua-users.org/wiki/DirTreeIterator <http://lua-users.org/wiki/DirTreeIterator>
>>
>> require "lfs"
>>
>> function dirtree(dir)
>> assert(dir and dir ~= "", "directory parameter is missing or empty")
>> if string.sub(dir, -1) == "/" then
>> dir=string.sub(dir, 1, -2)
>> end
>>
>> local function yieldtree(dir)
>> for entry in lfs.dir(dir) do
>> if entry ~= "." and entry ~= ".." then
>> entry=dir.."/"..entry
>> local attr=lfs.attributes(entry)
>> coroutine.yield(entry,attr)
>> if attr.mode == "directory" then
>> yieldtree(entry)
>> end
>> end
>> end
>> end
>>
>> return coroutine.wrap(function() yieldtree(dir) end)
>> end
>>
>> for filename, attr in dirtree("/usr/share") do
>> print(attr.mode, filename)
>> end
>>
>> $ luarocks install luafilesystem
>> $ time lua walker.lua > /dev/null
>> 0.30 real 0.16 user 0.14 sys
>>
>> Do you need more?
>>
>> Thanks for your help.
>> /Frank
>>
>> Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <erlang.org@REDACTED <mailto:erlang.org@REDACTED>> a écrit :
>> On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:
>>
>> > I would like to improve the speed of my directory walker.
>>
>> >
>>
>> > walk(Dir) ->
>>
>> > {ok, Files} = prim_file:list_dir(Dir),
>>
>> > walk(Dir, Files).
>>
>>
>>
>> Why prim_file:list_dir() instead of file:list_dir()? The former is
>>
>> undocumented internal function.
>>
>>
>>
>> [...]
>>
>> > Compared to almost anything i found on the web, it’s still very slow:
>>
>> > > timer:tc(fun() -> dir:walk("/usr/share") end).
>>
>> > {4662361,ok}
>>
>>
>>
>> What is it this "anything you found on the web"? And how did you run
>>
>> your comparisons? There's a large difference between first and second
>>
>> consequent run caused by OS' directory cache, and there's large
>>
>> difference between simply walking through the directory and walking with
>>
>> printing something to the screen for every file.
>>
>>
>>
>> Then there's also your using filelib:is_dir() and then
>>
>> filelib:file_size(), which means two stat(2) calls, while you only need
>>
>> to do it once per file (file:read_file_info()).
>>
>>
>>
>> --
>>
>> Stanislaw Klekot
>>
>>
>>
>>
>> _______________________________________________
>>
>>
>> erlang-questions mailing list
>>
>>
>> erlang-questions@REDACTED <mailto:erlang-questions@REDACTED>
>>
>>
>> http://erlang.org/mailman/listinfo/erlang-questions <http://erlang.org/mailman/listinfo/erlang-questions>
>>
>>
>>
>>
>>
>> _______________________________________________
>>
>>
>> erlang-questions mailing list
>>
>>
>> erlang-questions@REDACTED <mailto:erlang-questions@REDACTED>
>>
>>
>> http://erlang.org/mailman/listinfo/erlang-questions <http://erlang.org/mailman/listinfo/erlang-questions>
>>
>>
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://erlang.org/pipermail/erlang-questions/attachments/20161210/bcc52206/attachment.htm>
More information about the erlang-questions
mailing list