call for a lightweight xml parser
Ulf Wiger (AL/EAB)
ulf.wiger@REDACTED
Tue Oct 18 10:15:41 CEST 2005
I feel that I'm allowed to gripe a bit about xmerl, since I
wrote the thing initially. (:
xmerl has evolved into a mostly-complete xml parser
that has quite a lot of hooks and gadgets -- good for
tools that somehow operate on XML without realtime
requirements (e.g. edoc et al).
Leaving that aside for a moment, my own current need
is to have a lightweight xml parser that handles reasonable
XML chunks in a message-passing protocol.
I wrote a small XML parser. It's good enough for what I
want right now, so I'm probably going to leave it alone.
I timed it on a small XML-like string (no header, and no
DTD stuff):
18> timer:tc(vccXml,simple_xml,["<protocol name=\"vccBasic\" vsn=\"1.0\"/>"]).
{20,{simple_xml,[{"protocol",[{"vsn","1.0"},{"name","vccBasic"}],[]}]}}
Running a similar (slightly shorter) string through xmerl:
15> timer:tc(xmerl_scan,string,["<protocol value=\"vccBasic\"/>"]).
{410,
{{xmlElement,protocol,
protocol,
[],
{xmlNamespace,[],[]},
[],
1,
[{xmlAttribute,value,[],[],[],[],1,[],"vccBasic",false}],
[],
[],
"/home/etxuwig/work/erlang/vcc-0.36",
undeclared},
[]}}
Personally, I'd rather write my own incomplete parser than
pay half a millisecond just to parse a string of 37 characters...
Perhaps xmerl could include an 'xmerl_lite' parser that handles
the type of XML that you're most likely to encounter in a packet,
no entity refs, no strange encodings, etc.?
I've included my own light parser below. It's limited in that it
doesn't strictly do one-character at a time(*), and it should perhaps
handle some more stuff (e.g. the <?xml ...?> form) in order to be
sufficiently generic. See it as a suggestion.
(*) I don't need that at the moment, and my current objective is not to
write an XML parser. I run {packet,4} semantics, and will always
get the complete string.
Someone else may have a more complete, and equally lightweight
parser to contribute?
/Uffe
===================================
-module(vccXml).
-export([simple_xml/1]).
%% Guard test that succeeds when H is a space, carriage return, newline or
%% tab. It expands to alternative guards separated by ';', so it is meant to
%% be the entire guard of a clause (`when ?WHITESPACE(H)`); combining it with
%% other guard tests via ',' would not group the way it reads.
-define(WHITESPACE(H), H==$\s; H==$\r; H==$\n; H==$\t).
%% Raise the error {bad_xml, Prefix}, where Prefix is at most the first five
%% characters of the string T supplied by the caller.
-define(bad_xml(T), erlang:error({bad_xml, string:substr(T, 1, 5)})).
%% Parse an XML fragment given either as a binary or as a string (list of
%% characters). A binary is converted byte-by-byte to a list first.
%%
%% Returns {simple_xml, Elements}. Each element is a tuple
%% {Name, Attributes, Content} where Name is a string, Attributes is a list
%% of {AttrName, Value} string pairs and Content is a list of nested element
%% tuples and {text, String} tuples.
simple_xml(Bin) when is_binary(Bin) ->
    simple_xml(binary_to_list(Bin));
simple_xml(Str) ->
    Elements = simple_xml(strip(Str), []),
    {simple_xml, Elements}.
%% Parse the sequence of top-level elements in the input.
%%
%% Acc holds the elements parsed so far in reverse order; the result is the
%% list of elements in document order.
%% Raises {bad_xml, Prefix} if anything other than an element start is found
%% between or after the top-level elements.
simple_xml("<" ++ Str, Acc) ->
    {Rest, Acc1} = xml_tag(strip(Str), [], Acc),
    %% White space between top-level elements is not significant, and the
    %% element parser may hand back a remainder that still starts with it
    %% (e.g. a newline after a closing tag), so strip before continuing.
    simple_xml(strip(Rest), Acc1);
simple_xml([], Acc) ->
    lists:reverse(Acc);
simple_xml(Str, _Acc) ->
    %% Stray character data outside any element: report it in the same way
    %% as the other malformed-input cases in this module.
    ?bad_xml(Str).
%% Scan an element name, starting just after the opening "<".
%%
%% NameRev accumulates the name characters in reverse; Elems is the reversed
%% list of sibling elements parsed so far. Returns {Rest, Elems1} where the
%% newly parsed element has been pushed onto Elems.
%%   "/>"        - empty element with no attributes
%%   ">"         - element with no attributes; go on to parse its content
%%   white space - end of the name; go on to parse attributes
xml_tag("/>" ++ Rest, NameRev, Elems) ->
    Name = lists:reverse(NameRev),
    {strip(Rest), [{Name, [], []} | Elems]};
xml_tag(">" ++ Rest, NameRev, Elems) ->
    Name = lists:reverse(NameRev),
    xml_content(strip(Rest), Name, [], [], Elems);
xml_tag([C | Rest], NameRev, Elems) when ?WHITESPACE(C) ->
    Name = lists:reverse(NameRev),
    xml_attributes(strip(Rest), [], [], Name, Elems);
xml_tag([C | Rest], NameRev, Elems) ->
    xml_tag(Rest, [C | NameRev], Elems).
%% Scan an attribute name inside a start tag.
%%
%% NameAcc accumulates the attribute name in reverse, AAcc is the reversed
%% list of {Name, Value} attributes collected so far, Tag is the element name
%% and Acc the reversed list of sibling elements. Returns {Rest, Acc1} like
%% xml_tag/3.
%% Raises {bad_xml, Prefix} if an attribute name is followed by white space
%% that does not lead to "=".
xml_attributes("/>" ++ T, [], AAcc, Tag, Acc) ->
    %% The tag ends before another attribute name starts, e.g. "<br />".
    %% Without this clause "/" and ">" would be read as name characters.
    {strip(T), [{Tag, lists:reverse(AAcc), []} | Acc]};
xml_attributes(">" ++ T, [], AAcc, Tag, Acc) ->
    %% Same situation for a start tag with trailing white space: "<a >".
    xml_content(strip(T), Tag, [], lists:reverse(AAcc), Acc);
xml_attributes("=" ++ T, NameAcc, AAcc, Tag, Acc) ->
    xml_attr_value(strip(T), lists:reverse(NameAcc), AAcc, Tag, Acc);
xml_attributes([H | T], NameAcc, AAcc, Tag, Acc) when ?WHITESPACE(H) ->
    %% White space is allowed between the name and "=", but nothing else.
    case strip(T) of
        "=" ++ T1 ->
            xml_attr_value(strip(T1), lists:reverse(NameAcc), AAcc, Tag, Acc);
        _ ->
            ?bad_xml(T)
    end;
xml_attributes([H | T], NameAcc, AAcc, Tag, Acc) ->
    xml_attributes(T, [H | NameAcc], AAcc, Tag, Acc).
%% Scan a double-quoted attribute value and decide what follows it.
%%
%% ATag is the attribute name just read, AAcc the reversed list of attributes
%% collected so far, Tag the element name and Acc the reversed list of
%% sibling elements. Returns {Rest, Acc1} like xml_tag/3.
%% Raises {bad_xml, Prefix} if the value does not start with a double quote.
%%
%% Attributes are always delivered in document order. (Previously the "/>"
%% branch returned them in reverse order while the ">" branch returned them
%% in document order.)
xml_attr_value("\"" ++ T1, ATag, AAcc, Tag, Acc) ->
    {Str, T2} = scan_string(T1, []),
    AAcc1 = [{ATag, Str} | AAcc],
    case strip(T2) of
        "/>" ++ T3 ->
            {strip(T3), [{Tag, lists:reverse(AAcc1), []} | Acc]};
        ">" ++ T3 ->
            xml_content(strip(T3), Tag, [], lists:reverse(AAcc1), Acc);
        T3 ->
            %% Anything else is taken to be the start of another attribute.
            xml_attributes(T3, [], AAcc1, Tag, Acc)
    end;
xml_attr_value(T, _, _, _, _) ->
    ?bad_xml(T).
%% Parse the content of element Tag up to and including its closing tag.
%%
%% CAcc is the reversed list of content items (child elements and
%% {text, String} tuples) collected so far, Attrs the element's attributes
%% and Acc the reversed list of sibling elements. Returns {Rest, Acc1} with
%% the completed element pushed onto Acc.
%% Raises {bad_xml, Prefix} (via strip_prefix/2) if the closing tag does not
%% match Tag.
xml_content("</" ++ Str, Tag, CAcc, Attrs, Acc) ->
    %% XML permits white space between the name and ">" in a closing tag.
    AfterName = strip_prefix(Tag, Str),
    Rest = strip_prefix(">", strip(AfterName)),
    %% Strip what follows the closing tag, just as is done after "/>";
    %% otherwise the white space shows up as an empty {text, []} node in the
    %% parent, or is left in front of the next top-level element.
    {strip(Rest), [{Tag, Attrs, lists:reverse(CAcc)} | Acc]};
xml_content("<" ++ Str, Tag, CAcc, Attrs, Acc) ->
    %% A child element: it is pushed onto this element's content list.
    {Str1, CAcc1} = xml_tag(Str, [], CAcc),
    xml_content(Str1, Tag, CAcc1, Attrs, Acc);
xml_content([H | T], Tag, CAcc, Attrs, Acc) ->
    xml_text(T, [H], Tag, CAcc, Attrs, Acc).
%% Collect character data up to the next "<".
%%
%% TextRev holds the characters read so far in reverse. When "<" is reached,
%% trailing white space is dropped from the text (by stripping the front of
%% the reversed accumulator), a {text, String} item is pushed onto CAcc and
%% parsing resumes in xml_content/5 with the "<" still in the input.
xml_text([$< | _] = Input, TextRev, Tag, CAcc, Attrs, Acc) ->
    Text = lists:reverse(strip(TextRev)),
    xml_content(Input, Tag, [{text, Text} | CAcc], Attrs, Acc);
xml_text([C | Rest], TextRev, Tag, CAcc, Attrs, Acc) ->
    xml_text(Rest, [C | TextRev], Tag, CAcc, Attrs, Acc).
%% Remove Prefix (first argument) from the front of the string given as the
%% second argument and return what is left.
%% Raises {bad_xml, Prefix5} at the first position where the two differ,
%% reporting the remaining input from that point.
strip_prefix([], Rest) ->
    Rest;
strip_prefix([C | Prefix], [C | Rest]) ->
    strip_prefix(Prefix, Rest);
strip_prefix(_Prefix, Rest) ->
    ?bad_xml(Rest).
%% Drop leading white space (space, CR, LF, tab) from a string.
%% Anything that does not start with a white-space character is returned
%% unchanged.
strip([C | Rest]) when ?WHITESPACE(C) ->
    strip(Rest);
strip(Other) ->
    Other.
%% Read characters up to the closing double quote of an attribute value.
%%
%% ValueRev accumulates the value in reverse. Returns {Value, Rest} where
%% Rest is the input following the closing quote.
%% Raises {bad_xml, Prefix} with the start of the value if the input ends
%% before a closing quote is found.
scan_string([], ValueRev) ->
    ?bad_xml(lists:reverse(ValueRev));
scan_string([$\" | Rest], ValueRev) ->
    {lists:reverse(ValueRev), Rest};
scan_string([C | Rest], ValueRev) ->
    scan_string(Rest, [C | ValueRev]).
More information about the erlang-questions
mailing list