Patch for www_tools-1.0 (url_parse.erl, html_tokenise.erl, html_analyse.erl).

NOTE(review): recovered from a copy whose line breaks and indentation had
been collapsed. Lines were re-split using the counts in the @@ headers;
indentation and the placement of blank context lines are best guesses, so
apply with `patch -l` (ignore whitespace). The last url_parse.erl hunk is
a whitespace-only change whose exact whitespace could not be recovered.

--- www_tools-1.0/url_parse.erl	1997-03-14 10:00:22.000000000 +0100
+++ www_tools-1.0/url_parse.erl-new	2005-03-14 17:29:22.533120792 +0100
@@ -13,6 +13,48 @@
 
 -import(lists, [reverse/1, member/2]).
 
+
+%% _____________________________________________________________
+% merge(Dir, Rel) -> "/Dir/Rel": leading slashes of both parts are dropped; a "/" is inserted unless Dir ends in one
+merge([], Rel) ->
+    "/" ++ Rel;
+
+merge([$/|Name], [$/|Rel]) ->
+    merge(Name, Rel);
+
+merge(Name, [$/|Rel]) ->
+    merge(Name, Rel);
+
+merge([$/|Name], Rel) ->
+    merge(Name, Rel);
+
+merge(Name, Rel) ->
+    case lists:last(Name) of
+        $/ -> "/" ++ Name ++ Rel;
+        _  -> "/" ++ Name ++ "/" ++ Rel
+    end.
+
+
+%% _____________________________________________________________
+% upDir(Path): skip a trailing component containing "." (a file name), then drop one more component
+upDir(Name) ->
+    case member($/, Name) of
+        true ->
+            {Dir, File} = split_dir(Name),
+            case member($., File) of
+                true ->
+                    upDir(Dir);
+                false ->
+                    Dir
+            end;
+        false ->
+            case member($., Name) of
+                true ->
+                    "";
+                false ->
+                    Name
+            end
+    end.
 %%----------------------------------------------------------------------
 %% parse(URL) -> {http, Site, Port, File} |
 %% {file, File} | {error,Why}
@@ -55,6 +97,22 @@
 %% heurstics:
 %% If tail of file has a "." in it
 
+
+resolve(Root, [$.,$/|T]) ->
+    resolve(Root, T);
+
+resolve(Root, [$.,$.,$/|T]) ->
+    %% The relative bit starts with "../":
+    %% for http roots, go one directory up in Root and resolve the rest
+    case parse(Root) of
+        {http, Site, Port, File} ->
+            resolve("http://" ++ Site ++ port_str(Port) ++ "/" ++ upDir(File), T);
+        {file, _} ->
+            "file://" ++ [$.,$.,$/|T];
+        _Other ->
+            Root ++ [$.,$.,$/|T]
+    end;
+
 resolve(Root, [$/|T]) ->
     %% The relative bit is absolute
     %% the easy case
@@ -66,12 +124,12 @@
         Other ->
             Root ++ [$/|T]
     end;
+
 resolve(Root, Rel) ->
     %% The Rel bit is relative so we need to parse the root
     case parse(Root) of
         {http, Site, Port, File} ->
-            "http://" ++ Site ++ port_str(Port)
-                ++ rootDir(File) ++ "/" ++ Rel;
+            "http://" ++ Site ++ port_str(Port) ++ merge(rootDir(File), Rel);
         {file, File} ->
             "file://" ++ rootDir(File) ++ "/" ++ Rel;
         Other ->
@@ -88,6 +146,10 @@
 %% /a/b/c => "/a/b/c"
 %% a => "a"
 
+
+
+%% _____________________________________________________________
+
 rootDir(Name) ->
     case member($/, Name) of
         true ->
@@ -96,7 +158,7 @@
                 true ->
                     Dir;
                 false ->
-                    Name
+                    Name
             end;
         false ->
             case member($., Name) of
--- www_tools-1.0/html_tokenise.erl	1997-04-30 11:54:38.000000000 +0200
+++ www_tools-1.0/html_tokenise.erl-new	2005-03-14 17:24:50.186523768 +0100
@@ -303,13 +303,18 @@
 
 amp_digits([X | Xs], N) when X >= $0, X =< $9 ->
     amp_digits(Xs, N*10 + (X-$0));
+
 amp_digits([], N) ->
     if
         N >= 0, N =< 8 -> error;
         N >= 127, N =< 159 -> error;
         N > 255 -> error;
         true -> N
-    end.
+    end;
+
+amp_digits(_, Y) -> % a non-digit used to crash: function_clause,[{html_tokenise,amp_digits,["xa9",0]}]
+    amp_digits([], Y).
+
 
 
 dump_token(O, {raw, R}) ->
--- www_tools-1.0/html_analyse.erl	1997-04-05 14:38:27.000000000 +0200
+++ www_tools-1.0/html_analyse.erl-new	2005-03-14 17:26:22.529485504 +0100
@@ -9,17 +9,46 @@
 
 -compile(export_all).
 
--import(lists, [member/2, foldl/3]).
+-import(lists, [member/2, foldl/3, flatten/1]).
+-import(string, [substr/3, str/2, len/1, rchr/2, chr/2]).
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% body(Bin): tokenise Bin with html_tokenise:bin2toks/1 and analyse the tokens
+body(Bin) ->
+    Toks = html_tokenise:bin2toks(Bin),
+    analyse(Toks).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 file(File) ->
     Toks = html_tokenise:file2toks(File),
     analyse(Toks).
 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% analyse(Toks) -> [Url]: unique targets of <a href>, <area href>, <img src> and <body background>
 analyse(Toks) ->
-    Hrefs = [H || {tagStart, "a", L} <- Toks, {"href", H} <- L],
-    Images1 = [S || {tagStart, "img", L} <- Toks, {"src", S} <- L],
-    Images2 = [S || {tagStart, "body", L} <- Toks, {"background", S} <- L],
-    {remove_duplicates(Hrefs), remove_duplicates(Images1++Images2)}.
+    % Extract HREF. NOTE(review): these substring tests also drop URLs that merely contain "script" (e.g. "description.html") - confirm intent
+    Hrefs = [H || {tagStart, "a", L} <- Toks, {"href", H} <- L,
+                  str(H, "javascript") == 0, str(H, "mailto") == 0, str(H, "script") == 0],
+
+    % Extract SRC
+    Images1 = [S || {tagStart, "img", L} <- Toks, {"src", S} <- L,
+                    str(S, "javascript") == 0, str(S, "mailto") == 0, str(S, "script") == 0],
+
+    % Extract AREA
+    Area = [A || {tagStart, "area", L} <- Toks, {"href", A} <- L,
+                 str(A, "javascript") == 0, str(A, "mailto") == 0, str(A, "script") == 0],
+
+    % Extract background
+    Images2 = [B || {tagStart, "body", L} <- Toks, {"background", B} <- L],
+
+
+    Urls = Hrefs ++ Area ++ Images1 ++ Images2,
+    remove_duplicates(remove_quotes(remove_anchors(Urls))).
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 remove_duplicates(L) ->
     foldl(fun(X, Acc) ->
@@ -30,7 +59,54 @@
           end, [], L).
 
 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% remove_anchors(Urls): cut every URL at its first "#"
+remove_anchors(L) -> remove_anchors(L, []).
+remove_anchors([], NL) -> NL;
+remove_anchors([H|L], NL) ->
+    case str(H, "#") of
+        0 -> remove_anchors(L, [H|NL]);
+        Other ->
+            remove_anchors(L, [substr(H, 1, taille(len(H), Other - 1))|NL])
+    end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+taille(A, B) when B < 0 -> A;   % "taille" (French: size): B, or A when B is negative
+taille(_, B) -> B.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% remove_quotes(Urls): unwrap 'quoted' URLs, drop URLs with only an opening quote, keep the rest
+remove_quotes(L) ->
+    remove_quotes(L, []).
+
+remove_quotes([], NL) -> NL;
+remove_quotes([H|L], NL) ->
+    case chr(H, $') of
+        1 -> case len(H) > 1 andalso rchr(H, $') == len(H) of
+                 true  -> remove_quotes(L, [substr(H, 2, len(H) - 2)|NL]);
+                 false -> remove_quotes(L, NL)
+             end;
+        _ -> remove_quotes(L, [H|NL])
+    end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% resolve_url(Root, Urls): keep URLs under Root, resolve URLs without "://" against Root, drop the others
+resolve_url(Root, L) ->
+    resolve_url(Root, L, []).
+
+resolve_url(_Root, [], NL) ->
+    NL;
+
+resolve_url(Root, [H|L], NL) ->
+    case lists:prefix(Root, H) of
+        true  -> resolve_url(Root, L, [H|NL]);
+        false -> case str(H, "://") of
+                     0 -> resolve_url(Root, L, [url_parse:resolve(Root, H)|NL]);
+                     _ -> resolve_url(Root, L, NL)
+                 end
+    end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
 
-