[erlang-questions] comment on my erlang Spamfilter

hask ellian <>
Thu Jul 24 12:58:54 CEST 2008


I made a simple spam filter in Erlang. It takes 2 files with previous spam
and good emails, finds the most frequent words in each, and then counts how
many times those spammy words and good words occur in the file you want to
test. It calculates the ratio spam/(spam+good) and returns a number between
0 and 1.
It could easily be improved in numerous ways, but the main point for me was
to learn Erlang. This isn't exactly what Erlang is for, but it's a way to get
started.
I'd be happy to receive comments on the Erlang-ness of the code and
improvements.
File I/O seems slow, is there a better way? In Haskell it is fairly instant.


-module(antispam).
-export([take/2,count/2,count_all/1,most_common/2,count_set_in_list/2,
classify/0,readfile/1]).

%% @doc Return the first N elements of List.
%%
%% N is expected to be a non-negative integer. When List holds fewer than
%% N elements, the whole list is returned; the previous implementation
%% threw the collected prefix away and returned [] in that case, which made
%% callers lose all results for short inputs. Any N that is not a positive
%% integer yields [].
take(N, List) when is_integer(N), N > 0 ->
    %% lists:sublist/2 stops early when the list runs out and avoids the
    %% quadratic cost of repeatedly appending with Acc ++ [X].
    lists:sublist(List, N);
take(_N, _List) ->
    [].

%% @doc Count how many elements of List compare equal (==) to Tok.
count(Tok, List) ->
    tally(Tok, List, 0).

%% Walk the list one cell at a time, carrying the running total in Seen.
tally(Tok, [Head | Rest], Seen) when Tok == Head ->
    tally(Tok, Rest, Seen + 1);
tally(Tok, [_Other | Rest], Seen) ->
    tally(Tok, Rest, Seen);
%% Anything that is not a cons cell (normally the empty list) ends the walk.
tally(_Tok, _End, Seen) ->
    Seen.

%% @doc Count the occurrences of every distinct element of List.
%%
%% Returns a list of {Element, Count} tuples in ascending element order,
%% one tuple per distinct element. Elements are grouped with ==, as before.
%%
%% The list is sorted once and equal neighbours are collapsed in a single
%% pass, instead of rescanning the whole list for every distinct element.
count_all(List) ->
    count_runs(lists:sort(List), []).

%% Collapse a sorted list into {Element, RunLength} tuples. Acc holds the
%% tuples built so far, most recent first.
count_runs([], Acc) ->
    lists:reverse(Acc);
count_runs([Elem | Rest], [{Prev, N} | Acc]) when Elem == Prev ->
    %% Same value as the run currently being counted: extend it.
    count_runs(Rest, [{Prev, N + 1} | Acc]);
count_runs([Elem | Rest], Acc) ->
    %% First element, or a new value: start a fresh run.
    count_runs(Rest, [{Elem, 1} | Acc]).

%% @doc Total number of occurrences in List of all the elements of Set.
%%
%% Each element of Set is counted separately with count/2 and the
%% per-element counts are added together.
count_set_in_list(Set, List) ->
    lists:sum([count(Word, List) || Word <- Set]).

%% @doc Return up to Xmost of the most frequent words in Stringlist.
%%
%% Words of four characters or fewer are dropped first (a crude way of
%% skipping prepositions and similar filler). The remaining words are
%% counted, ordered by descending count, and the leading Xmost words are
%% returned without their counts.
most_common(Stringlist, Xmost) ->
    IsLongWord = fun(Word) -> length(Word) > 4 end,
    Counted = count_all(lists:filter(IsLongWord, Stringlist)),
    %% keysort/3 sorts ascending on the count; reversing puts the most
    %% frequent words at the front.
    Descending = lists:reverse(lists:keysort(2, Counted)),
    [Word || {Word, _Count} <- take(Xmost, Descending)].

%% @doc Read FileName and split its contents into a list of words.
%%
%% The file is read in one go and split on spaces, tabs, carriage returns
%% and newlines. Splitting on spaces alone glued the last word of one line
%% to the first word of the next, which skewed every word count for
%% multi-line emails.
%%
%% Crashes with a badmatch if the file cannot be read.
readfile(FileName) ->
    {ok, Binary} = file:read_file(FileName),
    string:tokens(binary_to_list(Binary), " \t\r\n").

%% @doc Score the test email against the stored good and spam corpora.
%%
%% Returns a number between 0 and 1: the share of matched words that are
%% typical spam words. Returns 0.5 when no word from either vocabulary
%% occurs in the test email.
classify() ->
    classify("C:/Users/saftarn/Desktop/emails/okemails.txt",
             "C:/Users/saftarn/Desktop/emails/spam.txt",
             "C:/Users/saftarn/Desktop/emails/test.txt").

%% Same as classify/0 but with the three file names passed in. The test
%% file is read and tokenized once and reused for both counts.
classify(GoodFile, SpamFile, TestFile) ->
    GoodWords = most_common(readfile(GoodFile), 20),
    BadWords = most_common(readfile(SpamFile), 20),
    TestWords = readfile(TestFile),
    GoodCount = count_set_in_list(GoodWords, TestWords),
    BadCount = count_set_in_list(BadWords, TestWords),
    case BadCount + GoodCount of
        0 ->
            %% No evidence either way: stay neutral and avoid dividing by zero.
            0.5;
        Total ->
            BadCount / Total
    end.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://erlang.org/pipermail/erlang-questions/attachments/20080724/3a12929d/attachment.html>


More information about the erlang-questions mailing list