[erlang-questions] comment on my erlang Spamfilter
Lev Walkin
vlm@REDACTED
Thu Jul 24 13:35:23 CEST 2008
hask ellian wrote:
> I made a simple spamfilter in Erlang. It takes 2 files with previous
> spam and good emails and then counts how many times the most frequent
> words from the spammy emails and the good emails occurs and then
> calculates the quote spam/(spam+good) in the file you want to test and
> returns a number between 0 and 1.
> It could easily be improved in numerous ways but the main point for me
> was to learn Erlang. This isn't exactly what Erlang is for but it's a way
> to get started.
> I'd be happy to receive comments on the Erlang-ness of the code and
> improvements.
> File I/O seems slow, is there a better way? In Haskell it is fairly instant.
>
>
> -module(antispam).
> -export([take/2,count/2,count_all/1,most_common/2,count_set_in_list/2,
> classify/0,readfile/1]).
>
> take(N,List) -> i_take(N,List,0,[]).
> i_take(N,List,Count,Acc) ->
> if Count < N andalso List /= [] ->
> i_take(N,tl(List),Count+1,Acc++[hd(List)]);
> Count == N ->
> Acc;
> true ->
> []
> end.
Consider using lists:sublist(List, N) instead of reimplementing
a standard library function.
> count(Tok,List) -> i_count(Tok,List,0).
> i_count(Tok,List,Acc) ->
> if Tok == hd(List) andalso List /= [] ->
> i_count(Tok,tl(List),Acc+1);
> Tok /= hd(List) andalso List /= [] ->
> i_count(Tok,tl(List),Acc);
> true ->
> Acc
> end.
Consider using
length([T || T <- List, T == Tok])
which is much more concise and manageable.
> count_all(List) ->
> Unique = lists:usort(List),
> [{U, count(U, List)} || U <- Unique].
This one has O(N^2) complexity. Consider using ets if the data
set is larger than, say, 1000 elements.
> count_set_in_list(Set,List) ->
> S = [{S, count(S, List)} || S <- Set],
> lists:sum(lists:map(fun({H,T}) -> T end, S)).
count_set_in_list(Set, List) ->
lists:sum([count(S, List) || S <- Set]).
> most_common(Stringlist,Xmost) ->
> No_preps = lists:filter(fun(X) -> length(X) > 4 end, Stringlist),
> Sorted_by_count = lists:keysort(2, count_all(No_preps)),
> TakeX = take(Xmost, lists:reverse(Sorted_by_count)),
> lists:map(fun({H,T}) -> H end, TakeX).
> readfile(FileName) ->
> {ok, Binary} = file:read_file(FileName),
> string:tokens(binary_to_list(Binary), " ").
>
> classify() ->
> GoodWords =
> most_common(readfile("C:/Users/saftarn/Desktop/emails/okemails.txt"), 20),
> BadWords =
> most_common(readfile("C:/Users/saftarn/Desktop/emails/spam.txt"), 20),
> GoodCount = count_set_in_list(GoodWords,
> readfile("C:/Users/saftarn/Desktop/emails/test.txt")),
> BadCount = count_set_in_list(BadWords,
> readfile("C:/Users/saftarn/Desktop/emails/test.txt")),
> T = BadCount + GoodCount,
> if T /= 0 ->
> BadCount / T;
> true ->
> 0.5
> end.
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> erlang-questions mailing list
> erlang-questions@REDACTED
> http://www.erlang.org/mailman/listinfo/erlang-questions
More information about the erlang-questions
mailing list