%% @copyright 2007 Hynek Vychodil
%% @author Hynek Vychodil
%%   [http://pichis_blog.blogspot.com/]
%% @version 0.0.1
%% @end
%% =====================================================================
%% @doc Single node map-reduce and fold-reduce for file like data sources
%%
-module(wf_pichi1).

-export([start_blkts/1, start_anders/1]).

%% Counts article hits in FileName using the plain byte-by-byte scanner
%% (scan_blkts/2) and prints the ten most frequent keys as "Count: Key".
%% Every block handed to the map fun is scanned and the hits are
%% accumulated in a public ETS table; the reduce fun carries no data.
%% NOTE(review): nlt_reader and file_map_reduce are project modules not
%% visible here -- presumably the reader yields binary blocks and
%% map_reduce returns only after every block has been mapped; confirm.
start_blkts(FileName) ->
    {ok, F} = nlt_reader:open(FileName),
    Reader = fun() -> nlt_reader:read(F) end,
    T = ets:new(wft, [public, set, {keypos, 1}]),
    Map = fun(B) -> scan_blkts(B, T) end,
    Reduce = fun(_, _) -> none end,
    file_map_reduce:map_reduce(Reader, {Map, Reduce}),
    print_top_ten(T).

%% Same as start_blkts/1, but each block is searched with the
%% shift-table based matcher find/3, using the table built by init/0.
start_anders(FileName) ->
    {ok, F} = nlt_reader:open(FileName),
    Reader = fun() -> nlt_reader:read(F) end,
    T = ets:new(wft, [public, set, {keypos, 1}]),
    Tbl = init(),
    Map = fun(B) -> find(B, Tbl, T) end,
    Reduce = fun(_, _) -> none end,
    file_map_reduce:map_reduce(Reader, {Map, Reduce}),
    print_top_ten(T).

%% Prints the ten highest counters of T, one "Count: Key" line each, and
%% then releases the table. The ranking is computed before the table is
%% deleted so the table does not outlive the call that created it.
print_top_ten(T) ->
    Top = top_ten(T),
    ets:delete(T),
    lists:foreach(fun({K, V}) -> io:format("~p: ~s~n", [V, K]) end, Top).

%% Returns up to ten {Key, Count} entries of Tab, highest count first.
%% The accumulator is kept sorted ascending by count at every step, so
%% its head is always the smallest retained entry; that invariant is
%% what makes the "X > C" eviction test in the second clause valid.
%% length/1 in the guard is cheap here: the list never exceeds 10.
top_ten(Tab) ->
    TopTen =
        fun(M, Acc) when length(Acc) < 10 ->
                lists:keysort(2, [M | Acc]);
           ({_, X} = E, [{_, C} | Es]) when X > C ->
                lists:keysort(2, [E | Es]);
           (_E, Acc) ->
                Acc
        end,
    lists:reverse(ets:foldl(TopTen, [], Tab)).

%% Scans Bin for "GET /ongoing/When/NNNx/YYYY/MM/DD/Name " requests and
%% bumps the counter of "YYYY/MM/DD/Name" in Tab for each one found.
%% The fixed-layout part of the pattern is 34 bytes long, so the last
%% start offset worth trying is byte_size(Bin) - 34.
scan_blkts(Bin, Tab) ->
    scan_blkts(Bin, 0, byte_size(Bin) - 33, Tab).

%% B: block, N: current offset, S: exclusive upper bound for N,
%% T: ETS table receiving the counters. Always returns ok.
scan_blkts(B, N, S, T) when N < S ->
    case B of
        <<_:N/binary, "GET /ongoing/When/", _, _, _, $x, $/,
          _, _, _, _, $/, _, _, $/, _, _, $/, _/binary>> ->
            %% N + 33 is the offset of the last "/" of the date part;
            %% S + 33 is the size of the block.
            case scan_key(B, N + 33, S + 33) of
                {none, F} ->
                    scan_blkts(B, F, S, T);
                {ok, F} ->
                    %% The key starts right after "NNNx/", 23 bytes
                    %% into the match, and ends just before the space.
                    N1 = N + 23,
                    L = F - N1,
                    <<_:N1/binary, Key:L/binary, _/binary>> = B,
                    safe_add_key(Key, T),
                    scan_blkts(B, F, S, T)
            end;
        _ ->
            scan_blkts(B, N + 1, S, T)
    end;
scan_blkts(_, _, _, _) ->
    ok.

%% Walks B from offset N (bounded by S) to the end of the article name.
%% Returns {ok, Pos} when a space terminates the name, and {none, Pos}
%% when a dot or newline is met first or the bound is reached.
scan_key(B, N, S) when N < S ->
    <<_:N/binary, C, _/binary>> = B,
    case C of
        $\s -> {ok, N};
        $. -> {none, N};
        $\n -> {none, N};
        _ -> scan_key(B, N + 1, S)
    end;
scan_key(_, N, _) ->
    {none, N}.
%% Increments the counter stored under Key in the ETS table T, creating
%% it with value 1 when it does not exist yet. Returns ok when the key
%% was created, otherwise the new counter value.
%% ets:update_counter/3 fails with badarg for a missing key; in that
%% case insert_new/2 is tried, and if another process created the key in
%% the meantime the increment is simply retried.
safe_add_key(Key, T) ->
    try
        ets:update_counter(T, Key, 1)
    catch
        error:badarg ->
            case ets:insert_new(T, {Key, 1}) of
                true -> ok;
                false -> ets:update_counter(T, Key, 1)
            end
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Based on Anders Nygren's wfbm4_ets1
% Changes:
%  - message passing removed
%  - ets:update_counter race condition solved by insert_new
%    and assume nobody deletes keys
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-define(STR, "] \"GET /ongoing/When/").
-define(REVSTR, "/nehW/gniogno/ TEG\" ]").
-define(STRLEN, 21). %length(?STR)
-define(DATELEN, 16). %length("200x/2000/00/00/")
-define(MATCHHEADLEN, 5). %length("200x/")

%% Stores, for every character of the search string except the last
%% one, its distance to the end of the string. Later occurrences of a
%% character overwrite earlier ones, so the smallest distance wins.
set_shifts(_, Count, Tbl) when Count =:= ?STRLEN - 1 ->
    Tbl;
set_shifts([H | T], Count, Tbl) ->
    Shift = ?STRLEN - Count - 1,
    set_shifts(T, Count + 1, dict:store(H, Shift, Tbl)).

%% Stores the default shift (the full search string length) for every
%% value in the given list.
set_defaults([], Tbl) ->
    Tbl;
set_defaults([V | T], Tbl) ->
    set_defaults(T, dict:store(V, ?STRLEN, Tbl)).

%% Builds the shift table as a 255-element tuple: element C holds the
%% shift for byte value C, for C in 1..255. Byte 0 has no entry.
init() ->
    D = set_shifts(?STR, 0, set_defaults(lists:seq(1, 255), dict:new())),
    list_to_tuple([S || {_C, S} <- lists:sort(dict:to_list(D))]).

%% Looks for the end of the article name at the start of Bin.
%% Returns {ok, Name} when a space ends a non-empty name, and
%% {nomatch, Len} when the name is empty, a dot is met first, or Bin
%% ends before any terminator; Len is the number of bytes inspected.
check_for_dot_or_space(Bin) ->
    check_for_dot_or_space(Bin, 0).

check_for_dot_or_space(<<$\s, _/binary>>, 0) ->
    {nomatch, 0};
check_for_dot_or_space(Bin, Len) when Len >= byte_size(Bin) ->
    %% Ran off the end without a terminator (e.g. a request cut at a
    %% block boundary); without this clause the scan would never stop.
    {nomatch, Len};
check_for_dot_or_space(Bin, Len) ->
    case Bin of
        <<_:Len/binary, $\s, _/binary>> ->
            <<Front:Len/binary, _/binary>> = Bin,
            {ok, Front};
        <<_:Len/binary, $., _/binary>> ->
            {nomatch, Len};
        _ ->
            check_for_dot_or_space(Bin, Len + 1)
    end.

%% Examines what follows a matched "GET /ongoing/When/" header.
%% Returns {ok, Key} with Key = "YYYY/MM/DD/Name" for a proper article
%% request, {skip, N} when the date part matched but the name did not
%% (N bytes can be skipped), and nomatch when the date layout is wrong
%% or the input is empty.
get_tail(<<>>) ->
    nomatch;
get_tail(<<_:3/binary, "x/", _:4/binary, $/, _:2/binary, $/,
           _:2/binary, $/, Tail/binary>> = Bin) ->
    case check_for_dot_or_space(Tail) of
        {ok, Match} ->
            %% 11 = length("YYYY/MM/DD/"); the leading 5 bytes are the
            %% "NNNx/" prefix that is not part of the key.
            Size = 11 + byte_size(Match),
            <<_:5/binary, FullMatch:Size/binary, _/binary>> = Bin,
            {ok, FullMatch};
        {nomatch, Skip} ->
            {skip, ?DATELEN + Skip}
    end;
get_tail(_) ->
    nomatch.
%% Compares the window Bin against the search string from right to left.
%% Len is the offset of the byte being compared, the list holds the
%% remaining characters of the reversed search string, Comps counts the
%% characters matched so far and Tbl is the shift table from init/0.
%% Returns {true, 0} on a full match, otherwise {false, Shift} where
%% Shift is the number of bytes the window may be advanced.
match_front(_, -1, _, _, _) ->
    {true, 0};
match_front(Bin, Len, [C1 | T], Comps, Tbl) ->
    <<_:Len/binary, C2:8, _/binary>> = Bin,
    case C1 of
        C2 ->
            match_front(Bin, Len - 1, T, Comps + 1, Tbl);
        _ ->
            case bad_char_shift(C2, Tbl) of
                ?STRLEN -> {false, ?STRLEN};
                Shift when Comps >= Shift -> {false, 1};
                Shift -> {false, Shift - Comps}
            end
    end.

%% Shift for the mismatching byte C. The table is a tuple indexed by
%% byte values 1..255, so a NUL byte has no entry; it cannot occur in
%% the search string and therefore gets the default full-length shift
%% instead of crashing element/2 with badarg.
bad_char_shift(0, _Tbl) ->
    ?STRLEN;
bad_char_shift(C, Tbl) ->
    element(C, Tbl).

%% Searches Bin for every occurrence of the request header ?STR and
%% counts the article key that follows it in the ETS table Tab.
%% Tbl is the shift table from init/0. Returns done.
find(Bin, Tbl, Tab) ->
    find(Bin, Tbl, Tab, 0).

%% N counts the headers handled so far; it is carried along but is not
%% part of the result.
find(Bin, _, _Tab, _N) when byte_size(Bin) =< ?STRLEN ->
    %% Too short to hold the header plus at least one more byte.
    done;
find(Bin, Tbl, Tab, N) ->
    <<Front:?STRLEN/binary, _/binary>> = Bin,
    case match_front(Front, ?STRLEN - 1, ?REVSTR, 0, Tbl) of
        {false, Shift} ->
            <<_:Shift/binary, Next/binary>> = Bin,
            find(Next, Tbl, Tab, N);
        {true, _} ->
            <<_:?STRLEN/binary, Tail/binary>> = Bin,
            case get_tail(Tail) of
                {ok, Match} ->
                    safe_add_key(Match, Tab),
                    %% Match lacks the "NNNx/" prefix, so add its
                    %% length to step over the whole request path.
                    Len = byte_size(Match) + ?MATCHHEADLEN,
                    <<_:Len/binary, Rest/binary>> = Tail,
                    find(Rest, Tbl, Tab, N + 1);
                {skip, Skip} ->
                    <<_:Skip/binary, More/binary>> = Tail,
                    find(More, Tbl, Tab, N + 1);
                nomatch ->
                    find(Tail, Tbl, Tab, N)
            end
    end.