%%% Convert Erlang mailing list HTML archives to mbox format
%%% Use 'wget' to fetch the raw archives
-module(ml2mb).
-export([go/0, message/1]).
%% @doc Exported entry point. Currently a stub: performs no work and
%% simply returns the atom `ok'.
go() ->
    ok.
%% @doc Convert one archived message file into an mbox file.
%%
%% Opens `Name' for reading, extracts the header section and then the
%% body section (in that order, from the same file handle), prefixes
%% them with a fixed mbox "From " separator line and writes the result
%% to `Name ++ ".mbox"'.
%%
%% `Name' must be a string (list), since the output path is built with
%% `++'. Returns whatever `file:write_file/2' returns. Crashes with
%% `badmatch' if the input file cannot be opened.
-spec message(Name :: string()) -> ok | {error, term()}.
message(Name) ->
    {ok, F} = file:open(Name, [read]),
    %% Always release the file handle, even if parsing crashes;
    %% otherwise every call leaks a descriptor and an io-server process.
    {Header, Body} =
        try
            %% Order matters: both readers consume the same handle.
            H = header(F),
            B = body(F),
            {H, B}
        after
            file:close(F)
        end,
    MBox_start = "From bogus@mail.address Sat Jan 3 01:05:34 1996\n",
    Bin = list_to_binary([MBox_start, Header, Body]),
    file:write_file(Name ++ ".mbox", Bin).
%% @doc Read the header section from `File' and strip HTML markup.
%%
%% The section is whatever `between_tags/3' returns for the start and
%% end delimiters given below; the collected lines are flattened into a
%% single string and passed through `dehtml/1'.
%%
%% NOTE(review): both delimiters are empty strings, so the section is
%% bounded by lines consisting of a bare newline. Confirm this is the
%% intended marker and not a marker string that was lost.
header(File) ->
    StartTag = "",
    EndTag = "",
    Lines = between_tags(File, StartTag, EndTag),
    Flat = lists:flatten(Lines),
    dehtml(Flat).
%% @doc Read the body section from `File'.
%%
%% Returns the lines `between_tags/3' collects for the start and end
%% delimiters given below, unmodified (no HTML stripping is applied).
%% Reading continues from the current position of `File'.
%%
%% NOTE(review): both delimiters are empty strings, so the section is
%% bounded by lines consisting of a bare newline. Confirm this is the
%% intended marker and not a marker string that was lost.
body(File) ->
    StartTag = "",
    EndTag = "",
    between_tags(File, StartTag, EndTag).
%% @doc Return the lines of `File' found between a start line and an
%% end line.
%%
%% Lines are read and discarded until one equals `Start ++ "\n"'; the
%% rest of the work is delegated to `to_end_tag/2' with `End'.
%% Returns `""' if end of file is reached before the start line is seen.
%%
%% Raises `error({read_failed, Reason})' if `io:get_line/2' reports
%% `{error, Reason}'. Without this clause the error tuple would be
%% treated as "just another non-matching line" and the function would
%% recurse forever on a persistent read error.
between_tags(File, Start, End) ->
    TS = Start ++ "\n",
    case io:get_line(File, "") of
        eof ->
            "";
        {error, Reason} ->
            erlang:error({read_failed, Reason});
        TS ->
            to_end_tag(File, End);
        _ ->
            between_tags(File, Start, End)
    end.
%% @doc Collect lines from `File' until the end line is reached.
%%
%% Returns the list of lines read (each still carrying its trailing
%% newline) up to, but not including, the first line equal to `End'.
%% Also stops at end of file, returning what was collected so far.
%%
%% The end line is recognised both as `End ++ "\n"' and as a bare `End';
%% the latter only occurs when the marker is the last line of a file
%% that lacks a final newline.
%%
%% Raises `error({read_failed, Reason})' if `io:get_line/2' reports
%% `{error, Reason}', rather than consing the error tuple onto the
%% result and recursing without bound.
to_end_tag(File, End) ->
    %% Computed up front because `++' is not permitted inside a guard.
    TE = End ++ "\n",
    case io:get_line(File, "") of
        eof ->
            [];
        {error, Reason} ->
            erlang:error({read_failed, Reason});
        Line when Line =:= TE; Line =:= End ->
            [];
        Line ->
            [Line | to_end_tag(File, End)]
    end.
%% @doc Remove HTML tags from `String'.
%%
%% Every substring matching `<[^>]+>' (a `<', one or more characters
%% other than `>', then `>') is deleted. The result is returned as a
%% string (list).
%%
%% Uses `re:replace/4' because the old `regexp' module is no longer
%% part of OTP, so `regexp:gsub/3' fails with `undef' on current
%% releases.
-spec dehtml(String :: string()) -> string().
dehtml(String) ->
    re:replace(String, "<[^>]+>", "", [global, {return, list}]).