[erlang-questions] comment on my erlang Spamfilter

Thu Jul 24 13:35:23 CEST 2008

hask ellian wrote:
> I made a simple spamfilter in Erlang. It takes 2 files with previous 
> spam and good emails and then counts how many times the most frequent 
> words from the spammy emails and the good emails occurs and then 
> calculates the quote spam/(spam+good) in the file you want to test and 
> returns  a number between 0 and 1.
> It could easily be improved in numerous ways but the main point for me 
> was to learn Erlang. This isn't exactly what Erlang is for but it s way 
> to get started.
> I'd be happy to receive comments on the Erlang-ness of the code and 
> improvements.
> File I/O seems slow, is there a better way? In Haskell it is fairly instant.
> 
> 
> -module(antispam).
> -export([take/2,count/2,count_all/1,most_common/2,count_set_in_list/2,
> classify/0,readfile/1]).
> 
> take(N,List) -> i_take(N,List,0,[]).
>     i_take(N,List,Count,Acc) ->
>      if Count < N andalso List /= [] ->
>          i_take(N,tl(List),Count+1,Acc++[hd(List)]);
>         Count == N ->
>          Acc;
>         true ->
>          []
>      end.

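Consider using lists:sublist/2 instead of reimplementing
standard library functions. (Note that lists:sublist/2 returns the
whole list when it has fewer than N elements, whereas take/2 above
returns [] in that case.)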

> count(Tok,List) -> i_count(Tok,List,0).  
>     i_count(Tok,List,Acc) ->
>         if Tok == hd(List) andalso List /= [] ->
>             i_count(Tok,tl(List),Acc+1);
>            Tok /= hd(List) andalso List /= [] ->
>             i_count(Tok,tl(List),Acc);
>            true ->
>             Acc
>     end.

Consider using

	length([T || T <- List, T == Tok])

which is much more concise and manageable.

> count_all(List) ->
>     Unique = lists:usort(List),
>     [{U, count(U, List)} || U <- Unique].

This one has O(N^2) complexity. Consider using ets if the data
set is larger than, say, 1000 elements.

> count_set_in_list(Set,List) ->
>     S = [{S, count(S, List)} || S <- Set],
>     lists:sum(lists:map(fun({H,T}) -> T end, S)).

%% Total number of occurrences in List of every element of Set.
%% Each element of Set is tallied with count/2 and the tallies are added up;
%% an empty Set yields 0.
count_set_in_list(Set, List) ->
    lists:foldl(fun(Word, Total) -> Total + count(Word, List) end, 0, Set).

> most_common(Stringlist,Xmost) ->
>     No_preps = lists:filter(fun(X) -> length(X) > 4 end, Stringlist),
>     Sorted_by_count = lists:keysort(2, count_all(No_preps)),
>     TakeX = take(Xmost, lists:reverse(Sorted_by_count)),
>     lists:map(fun({H,T}) -> H end, TakeX).

> readfile(FileName) ->
>     {ok, Binary} = file:read_file(FileName),
>     string:tokens(binary_to_list(Binary), " ").
> 
> classify() ->
>     GoodWords = 
> most_common(readfile("C:/Users/saftarn/Desktop/emails/okemails.txt"), 20),
>     BadWords  = 
> most_common(readfile("C:/Users/saftarn/Desktop/emails/spam.txt"), 20),
>     GoodCount = count_set_in_list(GoodWords, 
> readfile("C:/Users/saftarn/Desktop/emails/test.txt")),
>     BadCount  = count_set_in_list(BadWords,  
> readfile("C:/Users/saftarn/Desktop/emails/test.txt")),
>     T = BadCount + GoodCount,
>     if T /= 0 ->
>     BadCount / T;
>        true ->
>         0.5
>     end.
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> erlang-questions mailing list
> erlang-questions@REDACTED
> http://www.erlang.org/mailman/listinfo/erlang-questions