[erlang-questions] comment on my erlang Spamfilter

Thu Jul 24 14:48:51 CEST 2008

On Thu, Jul 24, 2008 at 1:35 PM, Lev Walkin <vlm@REDACTED> wrote:

> hask ellian wrote:
> > I made a simple spamfilter in Erlang. It takes 2 files with previous
> > spam and good emails and then counts how many times the most frequent
> > words from the spammy emails and the good emails occurs and then
> > calculates the quote spam/(spam+good) in the file you want to test and
> > returns  a number between 0 and 1.
> > It could easily be improved in numerous ways but the main point for me
> > was to learn Erlang. This isn't exactly what Erlang is for but it s way
> > to get started.
> > I'd be happy to receive comments on the Erlang-ness of the code and
> > improvements.
> > File I/O seems slow, is there a better way? In Haskell it is fairly
> instant.
> >
> >
> > -module(antispam).
> > -export([take/2,count/2,count_all/1,most_common/2,count_set_in_list/2,
> > classify/0,readfile/1]).
> >
> > take(N,List) -> i_take(N,List,0,[]).
> >     i_take(N,List,Count,Acc) ->
> >      if Count < N andalso List /= [] ->
> >          i_take(N,tl(List),Count+1,Acc++[hd(List)]);
> >         Count == N ->
> >          Acc;
> >         true ->
> >          []
> >      end.
>
> Consider using lists:sublist() instead of reimplementing
> the standard library methods.
>
> > count(Tok,List) -> i_count(Tok,List,0).
> >     i_count(Tok,List,Acc) ->
> >         if Tok == hd(List) andalso List /= [] ->
> >             i_count(Tok,tl(List),Acc+1);
> >            Tok /= hd(List) andalso List /= [] ->
> >             i_count(Tok,tl(List),Acc);
> >            true ->
> >             Acc
> >     end.
>
> Consider using
>
>        length([T == Tok || T <- List])

I guess you mean
count(Tok, List) -> length([T || T<-List, T=:=Tok]).

or may be better (less memory consuming)

count(Tok, List) -> lists:foldl(fun(Tok, Acc)->Acc+1; (_,Acc)->Acc end, 0,
List).

but tail recursive version is not so bad at all (guess fastest)

count(Tok, List) -> i_count(Tok, List, 0).
i_count(_, [], Count) -> Count;
i_count(Tok, [Tok|Rest], Count) -> i_count(Tok, Rest, Count+1);
i_count(Tok, [_|Rest], Count) -> i_count(Tok, Rest, Count).

and non tail recursive version can work for no so much big Lists reasonably
well

count(_, [])->0;
count(Tok, [Tok|Rest])->1+count(Tok, Rest);
count(Tok, [_|Rest)->count(Tok, Rest).

>
> which is much more concise and manageable.
>
> > count_all(List) ->
> >     Unique = lists:usort(List),
> >     [{U, count(U, List)} || U <- Unique].
>
> This one has O(N^2) complexity. Consider using ets if the data
> set is larger than, say, 1000 elements.
>
> > count_set_in_list(Set,List) ->
> >     S = [{S, count(S, List)} || S <- Set],
> >     lists:sum(lists:map(fun({H,T}) -> T end, S)).
>
> count_set_in_list(Set, List) ->
>         lists:sum([count(S, List) || S <- Set]).

It is O(N*M) ets or dict should perform better too.

count_set_in_list(Set, List) ->
     lists:foldl(fun(X, D)->dict:update_counter(X, 1, D) end, dict:new(),
List),
     lists:foldl(fun(K, Acc)->
          try dict:fetch(K, CountAll) of
             Count -> Acc+Count
          catch _:_->Acc end
       end, 0, Set).

or less memory consuming and probalbly faster

count_set_in_list(Set, List) ->
   S = sets:from_list(Set);
   lists:foldl(fun(K, Acc) ->
      case sets:is_element(K, S) of
          true -> Acc+1;
          false -> Acc
      end
   end, 0, List).

But for short Set it can be slower than

count_set_in_list(Set, List) ->
   lists:foldl(fun(K, Acc) ->
      case lists:member(K, Set) of
          true -> Acc+1;
          false -> Acc
      end
   end, 0, List).

which can be simplified as again O(N*M)

count_set_in_list(Set, List) ->
       length([ok || X<-List, Y<-Set, X=:=Y]).

>
> > most_common(Stringlist,Xmost) ->
> >     No_preps = lists:filter(fun(X) -> length(X) > 4 end, Stringlist),
> >     Sorted_by_count = lists:keysort(2, count_all(No_preps)),
> >     TakeX = take(Xmost, lists:reverse(Sorted_by_count)),
> >     lists:map(fun({H,T}) -> H end, TakeX).
>
>
>
> > readfile(FileName) ->
> >     {ok, Binary} = file:read_file(FileName),
> >     string:tokens(binary_to_list(Binary), " ").
> >
> > classify() ->
> >     GoodWords =
> > most_common(readfile("C:/Users/saftarn/Desktop/emails/okemails.txt"),
> 20),
> >     BadWords  =
> > most_common(readfile("C:/Users/saftarn/Desktop/emails/spam.txt"), 20),
> >     GoodCount = count_set_in_list(GoodWords,
> > readfile("C:/Users/saftarn/Desktop/emails/test.txt")),
> >     BadCount  = count_set_in_list(BadWords,
> > readfile("C:/Users/saftarn/Desktop/emails/test.txt")),
> >     T = BadCount + GoodCount,
> >     if T /= 0 ->
> >     BadCount / T;
> >        true ->
> >         0.5
> >     end.
> >
> >
> > ------------------------------------------------------------------------
> >
> > _______________________________________________
> > erlang-questions mailing list
> > erlang-questions@REDACTED
> > http://www.erlang.org/mailman/listinfo/erlang-questions
>
> _______________________________________________
> erlang-questions mailing list
> erlang-questions@REDACTED
> http://www.erlang.org/mailman/listinfo/erlang-questions
>

-- 
--Hynek (Pichi) Vychodil
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://erlang.org/pipermail/erlang-questions/attachments/20080724/680acd73/attachment.htm>