call for a lightweight xml parser

Tue Oct 18 10:15:41 CEST 2005

I feel that I'm allowed to gripe a bit about xmerl, since I 
wrote the thing initially.  (:

xmerl has evolved into a mostly-complete xml parser
that has quite a lot of hooks and gadgets -- good for 
tools that somehow operate on XML without realtime
requirements (e.g. edoc et al). 

Leaving that aside for a moment, my own current need 
is to have a lightweight xml parser that handles reasonable
XML chunks in a message-passing protocol.

I wrote a small XML parser. It's good enough for what I 
want right now, so I'm probably going to leave it alone.
I timed it on a small XML-like string (no header, and no
DTD stuff):

18> timer:tc(vccXml,simple_xml,["<protocol name=\"vccBasic\" vsn=\"1.0\"/>"]).
{20,{simple_xml,[{"protocol",[{"vsn","1.0"},{"name","vccBasic"}],[]}]}}

Running a similar (slightly shorter) string through xmerl:

15> timer:tc(xmerl_scan,string,["<protocol value=\"vccBasic\"/>"]).
{410,
 {{xmlElement,protocol,
              protocol,
              [],
              {xmlNamespace,[],[]},
              [],
              1,
              [{xmlAttribute,value,[],[],[],[],1,[],"vccBasic",false}],
              [],
              [],
              "/home/etxuwig/work/erlang/vcc-0.36",
              undeclared},
  []}}

Personally, I'd rather write my own incomplete parser than 
pay half a millisecond just to parse a string of 37 characters...

Perhaps xmerl could include an 'xmerl_lite' parser that handles
the type of XML that you're most likely to encounter in a packet,
no entity refs, no strange encodings, etc.?

I've included my own light parser below. It's limited in that it 
doesn't strictly process one character at a time (*), and it should perhaps 
handle some more constructs (e.g. the <?xml ...?> declaration) in order to be 
sufficiently generic. See it as a suggestion.

(*) I don't need that at the moment, and my current objective is not to 
write an XML parser. I run {packet,4} semantics, and will always
get the complete string.

Someone else may have a more complete, and equally lightweight
parser to contribute?

/Uffe

===================================

%% Lightweight XML parser operating on a complete XML string held in memory.
%% The only exported entry point is simple_xml/1.
-module(vccXml).

-export([simple_xml/1]).

%% Guard test: succeeds when H is a space, carriage return, newline or tab.
%% It expands to a guard *sequence* (alternatives separated by `;'), so take
%% care when combining it with other guard tests.
-define(WHITESPACE(H), H==$\s; H==$\r; H==$\n; H==$\t).
%% Raises error {bad_xml, Prefix}, where Prefix is at most the first five
%% characters of the string T (the input at the point where parsing failed).
-define(bad_xml(T), erlang:error({bad_xml, string:substr(T, 1, 5)})).

%% simple_xml(Input) -> {simple_xml, Elements}
%%
%% Parses Input, given either as a binary or as a string (list of
%% characters). Leading whitespace is skipped before parsing starts.
%%
%% Elements is the list of top-level elements in document order. Each
%% element is a tuple {Tag, Attributes, Content} where Tag is a string,
%% Attributes is a list of {Name, Value} string pairs, and Content is a
%% list of nested elements and {text, String} tuples.
%%
%% Malformed input raises either error {bad_xml, Prefix} or a
%% function_clause error from one of the internal scanning functions.
simple_xml(Bin) when is_binary(Bin) ->
    %% Convert once, then reuse the string clause below.
    simple_xml(binary_to_list(Bin));
simple_xml(Str) ->
    Elements = simple_xml(strip(Str), []),
    {simple_xml, Elements}.

%% simple_xml(Input, Parsed) -> Elements
%%
%% Top-level loop. Parsed accumulates the elements found so far in reverse
%% order. Every iteration expects Input to begin with `<', hands the rest
%% (minus leading whitespace) to xml_tag/3, and continues with whatever
%% input xml_tag/3 leaves behind. When the input is exhausted the
%% accumulated elements are returned in document order.
simple_xml([], Parsed) ->
    lists:reverse(Parsed);
simple_xml([$< | AfterLt], Parsed) ->
    {Remaining, Parsed1} = xml_tag(strip(AfterLt), [], Parsed),
    simple_xml(Remaining, Parsed1).

%% xml_tag(Input, NameAcc, Acc) -> {RestOfInput, Acc1}
%%
%% Scans an element name; Input starts right after the opening `<'.
%% NameAcc holds the name characters read so far, in reverse order.
%% Acc is the (reversed) list of sibling elements already parsed; the
%% finished element is pushed onto it.
%%
%%   `>'         ends the start tag; the element content is parsed next.
%%   `/>'        ends an empty element with no attributes.
%%   whitespace  ends the name; attribute scanning takes over.
%%   otherwise   the character belongs to the name.
xml_tag([$> | Rest], NameAcc, Acc) ->
    xml_content(strip(Rest), lists:reverse(NameAcc), [], [], Acc);
xml_tag([$/, $> | Rest], NameAcc, Acc) ->
    Element = {lists:reverse(NameAcc), [], []},
    {strip(Rest), [Element | Acc]};
xml_tag([C | Rest], NameAcc, Acc) when ?WHITESPACE(C) ->
    xml_attributes(strip(Rest), [], [], lists:reverse(NameAcc), Acc);
xml_tag([C | Rest], NameAcc, Acc) ->
    xml_tag(Rest, [C | NameAcc], Acc).

%% xml_attributes(Input, NameAcc, AAcc, Tag, Acc) -> {RestOfInput, Acc1}
%%
%% Scans one attribute name inside the start tag of element Tag.
%%   Input    remaining input, positioned at (or inside) an attribute name
%%   NameAcc  characters of the attribute name read so far, reversed
%%   AAcc     attributes already parsed for this element, reversed
%%   Acc      sibling elements already parsed, reversed
%%
%% When the name is complete (an `=' is found, optionally preceded by
%% whitespace) control passes to xml_attr_value/5. Whitespace after the
%% name that is not followed by `=' raises error {bad_xml, Prefix}.
%%
%% If the tag ends (`>' or `/>') before any attribute name has started,
%% the element is closed here. This covers start tags with whitespace
%% after the element name and no attributes, e.g. "<tag >" and "<tag />",
%% which would otherwise be scanned as an attribute name.
xml_attributes("/>" ++ T, [], AAcc, Tag, Acc) ->
    {strip(T), [{Tag, lists:reverse(AAcc), []}|Acc]};
xml_attributes(">" ++ T, [], AAcc, Tag, Acc) ->
    xml_content(strip(T), Tag, [], lists:reverse(AAcc), Acc);
xml_attributes("=" ++ T, TagAcc, AAcc, Tag, Acc) ->
    xml_attr_value(strip(T), lists:reverse(TagAcc), AAcc, Tag, Acc);
xml_attributes([H|T], TagAcc, AAcc, Tag, Acc) when ?WHITESPACE(H) ->
    %% Whitespace is allowed between the attribute name and `='.
    case strip(T) of
        "=" ++ T1 ->
            xml_attr_value(strip(T1), lists:reverse(TagAcc), AAcc, Tag, Acc);
        _ ->
            ?bad_xml(T)
    end;
xml_attributes([H|T], TagAcc, AAcc, Tag, Acc) ->
    xml_attributes(T, [H|TagAcc], AAcc, Tag, Acc).

%% xml_attr_value(Input, ATag, AAcc, Tag, Acc) -> {RestOfInput, Acc1}
%%
%% Scans the value of attribute ATag of element Tag. Input must begin with
%% a double quote; anything else raises error {bad_xml, Prefix} (single
%% quoted values are not supported).
%%   AAcc  attributes already parsed for this element, reversed
%%   Acc   sibling elements already parsed, reversed
%%
%% After the value, the tag either ends (`/>' or `>') or another attribute
%% follows. In both terminating cases the attribute list is reversed so
%% that the element carries its attributes in document order, whether or
%% not the element is self-closing.
xml_attr_value("\"" ++ T1, ATag, AAcc, Tag, Acc) ->
    {Str, T2} = scan_string(T1, []),
    AAcc1 = [{ATag, Str}|AAcc],
    case strip(T2) of
        "/>" ++ T3 ->
            %% Empty element: no content to parse.
            {strip(T3), [{Tag, lists:reverse(AAcc1), []}|Acc]};
        ">" ++ T3 ->
            xml_content(strip(T3), Tag, [], lists:reverse(AAcc1), Acc);
        T3 ->
            %% More attributes follow.
            xml_attributes(T3, [], AAcc1, Tag, Acc)
    end;
xml_attr_value(T, _, _, _, _) ->
    ?bad_xml(T).

%% xml_content(Input, Tag, CAcc, Attrs, Acc) -> {RestOfInput, Acc1}
%%
%% Parses the content of element Tag up to and including its end tag.
%%   CAcc   content items (child elements and {text, String}) so far, reversed
%%   Attrs  the element's attribute list, stored in the result as given
%%   Acc    sibling elements already parsed, reversed
%%
%% The end tag must be exactly "</" ++ Tag ++ ">"; otherwise strip_prefix/2
%% raises error {bad_xml, Prefix}. Whitespace following the end tag is
%% skipped, just as it is after a self-closing tag, so that the caller
%% always resumes at the next `<', at text, or at end of input. Without
%% this, trailing whitespace after a top-level element would not match
%% any clause of simple_xml/2, and whitespace between sibling elements
%% would produce empty text nodes.
xml_content("</" ++ Str, Tag, CAcc, Attrs, Acc) ->
    Str1 = strip_prefix(Tag ++ ">", Str),
    {strip(Str1), [{Tag, Attrs, lists:reverse(CAcc)}|Acc]};
xml_content("<" ++ Str, Tag, CAcc, Attrs, Acc) ->
    %% Child element: it is pushed onto this element's content accumulator.
    {Str1, CAcc1} = xml_tag(Str, [], CAcc),
    xml_content(Str1, Tag, CAcc1, Attrs, Acc);
xml_content([H|T], Tag, CAcc, Attrs, Acc) ->
    %% Anything else starts a run of character data.
    xml_text(T, [H], Tag, CAcc, Attrs, Acc).

%% xml_text(Input, TextAcc, Tag, CAcc, Attrs, Acc) -> {RestOfInput, Acc1}
%%
%% Collects character data inside element Tag until the next `<'.
%% TextAcc holds the characters read so far in reverse order. The finished
%% run is added to the content accumulator CAcc as {text, String}, and
%% parsing continues in xml_content/5 with the `<' still in the input.
xml_text([$< | _] = Rest, TextAcc, Tag, CAcc, Attrs, Acc) ->
    %% TextAcc is reversed, so stripping its head removes the *trailing*
    %% whitespace of the text.
    Text = lists:reverse(strip(TextAcc)),
    xml_content(Rest, Tag, [{text, Text} | CAcc], Attrs, Acc);
xml_text([C | Rest], TextAcc, Tag, CAcc, Attrs, Acc) ->
    xml_text(Rest, [C | TextAcc], Tag, CAcc, Attrs, Acc).

%% strip_prefix(Prefix, Str) -> Rest
%%
%% Removes Prefix from the front of Str and returns what follows it.
%% If Str does not start with Prefix, raises error {bad_xml, P} where P is
%% taken from the part of Str at which the comparison failed.
strip_prefix([], Rest) ->
    Rest;
strip_prefix([C | Prefix], [C | Rest]) ->
    strip_prefix(Prefix, Rest);
strip_prefix(_Mismatch, Rest) ->
    ?bad_xml(Rest).

%% strip(Str) -> Str1
%%
%% Drops leading whitespace (space, CR, LF, tab) from Str. Anything that
%% does not start with a whitespace character is returned unchanged.
strip([C | Rest] = Str) ->
    if
        ?WHITESPACE(C) -> strip(Rest);
        true -> Str
    end;
strip(Other) ->
    Other.

%% scan_string(Input, Acc) -> {String, RestOfInput}
%%
%% Reads characters up to the next double quote. Input starts just after
%% the opening quote; Acc holds the characters read so far, reversed.
%% Returns the quoted string and the input following the closing quote.
%% Reaching the end of input without a closing quote raises
%% error {bad_xml, Prefix}, with Prefix taken from the unterminated string.
scan_string([], Seen) ->
    ?bad_xml(lists:reverse(Seen));
scan_string([$" | Rest], Seen) ->
    {lists:reverse(Seen), Rest};
scan_string([C | Rest], Seen) ->
    scan_string(Rest, [C | Seen]).