call for a lightweight xml parser

Wed Oct 19 11:00:07 CEST 2005

Why not try to include a faster parser in the xmerl application?

xmerl could by default use a fast, streamlined parser, and as soon
as it encounters something it can't handle, it can revert to the
slower but more complete parser.

I think that would be much better from the user's point of view
than introducing Yet Another XML parser for Erlang.

/Björn

"Ulf Wiger (AL/EAB)" <ulf.wiger@REDACTED> writes:

> I feel that I'm allowed to gripe a bit about xmerl, since I 
> wrote the thing initially.  (:
> 
> xmerl has evolved into a mostly-complete xml parser
> that has quite a lot of hooks and gadgets -- good for 
> tools that somehow operate on XML without realtime
> requirements (e.g. edoc et al). 
> 
> Leaving that aside for a moment, my own current need 
> is to have a lightweight xml parser that handles reasonable
> XML chunks in a message-passing protocol.
> 
> I wrote a small XML parser. It's good enough for what I 
> want right now, so I'm probably going to leave it alone.
> I timed it on a small XML-like string (no header, and no
> DTD stuff):
> 
> 18> timer:tc(vccXml,simple_xml,["<protocol name=\"vccBasic\" vsn=\"1.0\"/>"]).
> {20,{simple_xml,[{"protocol",[{"vsn","1.0"},{"name","vccBasic"}],[]}]}}
> 
> Running the same string through xmerl:
> 
> 15> timer:tc(xmerl_scan,string,["<protocol value=\"vccBasic\"/>"]).
> {410,
>  {{xmlElement,protocol,
>               protocol,
>               [],
>               {xmlNamespace,[],[]},
>               [],
>               1,
>               [{xmlAttribute,value,[],[],[],[],1,[],"vccBasic",false}],
>               [],
>               [],
>               "/home/etxuwig/work/erlang/vcc-0.36",
>               undeclared},
>   []}}
> 
> Personally, I'd rather write my own incomplete parser than 
> pay half a millisecond just to parse a string of 37 characters...
> 
> Perhaps xmerl could include an 'xmerl_lite' parser that handles
> the type of XML that you're most likely to encounter in a packet,
> no entity refs, no strange encodings, etc.?
> 
> I've included my own light parser below. It's limited in that it 
> doesn't strictly do one-character at a time(*), and it should perhaps 
> handle some more stuff (e.g. the <?xml ...?> form) in order to be 
> sufficiently generic. See it as a suggestion.
> 
> (*) I don't need that at the moment, and my current objective is not to 
> write an XML parser. I run {packet,4} semantics, and will always
> get the complete string.
> 
> Someone else may have a more complete, and equally lightweight
> parser to contribute?
> 
> /Uffe
> 
> ===================================
> 
> -module(vccXml).
> 
> -export([simple_xml/1]).
> 
> %% Guard test that succeeds when H is a space, carriage return, newline
> %% or tab. It expands to a `;'-separated sequence of alternatives, so it
> %% is meant to be used as the whole guard of a clause.
> -define(WHITESPACE(H), H==$\s; H==$\r; H==$\n; H==$\t).
> %% Raise an error exception {bad_xml, Prefix}, where Prefix is up to the
> %% first five characters of T (the input remaining at the point of failure).
> -define(bad_xml(T), erlang:error({bad_xml, string:substr(T, 1, 5)})).
> 
> %% Parse XML text given either as a binary or as a string (list of
> %% characters). Leading whitespace is skipped. Returns
> %% {simple_xml, Elements}, where each element is a
> %% {Tag, Attributes, Content} tuple.
> simple_xml(Bin) when is_binary(Bin) ->
>     %% Convert once, then take the string path below.
>     simple_xml(binary_to_list(Bin));
> simple_xml(Str) ->
>     Elements = simple_xml(strip(Str), []),
>     {simple_xml, Elements}.
> 
> %% Top-level loop: parse one element for every "<" until the input is
> %% exhausted. Elements are accumulated in reverse and put back in
> %% document order at the end.
> simple_xml([], Elements) ->
>     lists:reverse(Elements);
> simple_xml([$< | AfterLt], Elements) ->
>     {Remaining, Elements1} = xml_tag(strip(AfterLt), [], Elements),
>     simple_xml(Remaining, Elements1).
> 
> %% Scan an element name; the input starts right after the opening "<".
> %% TagAcc holds the name characters read so far, in reverse order.
> %% Returns {Rest, [Element | Acc]} once the whole element (including any
> %% attributes and content) has been parsed.
> xml_tag("/>" ++ T, TagAcc, Acc) ->
>     %% Self-closing element without attributes.
>     {strip(T), [{lists:reverse(TagAcc), [], []}|Acc]};
> xml_tag(">" ++ Str, TagAcc, Acc) ->
>     xml_content(strip(Str), lists:reverse(TagAcc), [], [], Acc);
> xml_tag([H|T], TagAcc, Acc) when ?WHITESPACE(H) ->
>     %% Whitespace ends the name. It is followed either by attributes or,
>     %% as in "<br />" and "<a >", directly by the closing delimiter. The
>     %% latter must not be handed to the attribute scanner, which would
>     %% treat "/" and ">" as part of an attribute name.
>     case strip(T) of
>         "/>" ++ _ = Rest ->
>             xml_tag(Rest, TagAcc, Acc);
>         ">" ++ _ = Rest ->
>             xml_tag(Rest, TagAcc, Acc);
>         Rest ->
>             xml_attributes(Rest, [], [], lists:reverse(TagAcc), Acc)
>     end;
> xml_tag([H|T], TagAcc, Acc) ->
>     xml_tag(T, [H|TagAcc], Acc).
> 
> 
> %% Scan an attribute name up to its "=" sign, then hand over to
> %% xml_attr_value/5. NameAcc holds the name read so far in reverse order,
> %% Attrs the attributes already parsed for the element Tag.
> xml_attributes([$= | Rest], NameAcc, Attrs, Tag, Acc) ->
>     xml_attr_value(strip(Rest), lists:reverse(NameAcc), Attrs, Tag, Acc);
> xml_attributes([C | Rest], NameAcc, Attrs, Tag, Acc) when ?WHITESPACE(C) ->
>     %% Whitespace after the name is only accepted if "=" follows.
>     case strip(Rest) of
>         [$= | AfterEq] ->
>             xml_attr_value(strip(AfterEq), lists:reverse(NameAcc),
>                            Attrs, Tag, Acc);
>         _ ->
>             ?bad_xml(Rest)
>     end;
> xml_attributes([C | Rest], NameAcc, Attrs, Tag, Acc) ->
>     xml_attributes(Rest, [C | NameAcc], Attrs, Tag, Acc).
> 
> %% Parse a double-quoted attribute value for the attribute Name, then
> %% decide what follows: end of a self-closing element, start of the
> %% element content, or another attribute. Anything other than an opening
> %% double quote raises bad_xml.
> xml_attr_value([$" | AfterQuote], Name, Attrs, Tag, Acc) ->
>     {Value, AfterValue} = scan_string(AfterQuote, []),
>     Attrs1 = [{Name, Value} | Attrs],
>     case strip(AfterValue) of
>         "/>" ++ Rest ->
>             %% NOTE: the attribute list is returned as accumulated here,
>             %% i.e. in reverse source order, whereas the ">" branch below
>             %% reverses it back to source order.
>             {strip(Rest), [{Tag, Attrs1, []} | Acc]};
>         ">" ++ Rest ->
>             xml_content(strip(Rest), Tag, [], lists:reverse(Attrs1), Acc);
>         Rest ->
>             xml_attributes(Rest, [], Attrs1, Tag, Acc)
>     end;
> xml_attr_value(Other, _, _, _, _) ->
>     ?bad_xml(Other).
> 
> 
> 
> 
> %% Parse the content of the element Tag: child elements and text, up to
> %% and including the matching closing tag. CAcc holds the content parsed
> %% so far in reverse order. Returns {Rest, [{Tag, Attrs, Content} | Acc]}.
> %% A closing tag that does not match Tag raises bad_xml (via
> %% strip_prefix/2).
> xml_content("</" ++ Str, Tag, CAcc, Attrs, Acc) ->
>     %% Skip whitespace after the closing tag, as is done after "/>" and
>     %% ">". Otherwise trailing whitespace at top level (e.g. "<a></a>\n")
>     %% fails to match in simple_xml/2, and whitespace between sibling
>     %% elements shows up as an empty {text, ""} node.
>     Str1 = strip(strip_prefix(Tag ++ ">", Str)),
>     {Str1, [{Tag, Attrs, lists:reverse(CAcc)}|Acc]};
> xml_content("<" ++ Str, Tag, CAcc, Attrs, Acc) ->
>     %% Child element: it is pushed onto this element's content list.
>     {Str1, CAcc1} = xml_tag(Str, [], CAcc),
>     xml_content(Str1, Tag, CAcc1, Attrs, Acc);
> xml_content([H|T], Tag, CAcc, Attrs, Acc) ->
>     xml_text(T, [H], Tag, CAcc, Attrs, Acc).
> 
> %% Collect character data up to the next "<" and add it to the content
> %% of the element Tag as a {text, String} item. TextAcc holds the text
> %% read so far in reverse order.
> xml_text([$< | _] = Input, TextAcc, Tag, CAcc, Attrs, Acc) ->
>     %% TextAcc is reversed, so stripping its head removes the trailing
>     %% whitespace of the text before it is put back in reading order.
>     Text = lists:reverse(strip(TextAcc)),
>     xml_content(Input, Tag, [{text, Text} | CAcc], Attrs, Acc);
> xml_text([C | Rest], TextAcc, Tag, CAcc, Attrs, Acc) ->
>     xml_text(Rest, [C | TextAcc], Tag, CAcc, Attrs, Acc).
> 
> %% Remove Prefix (first argument) from the front of the input (second
> %% argument) and return what follows it. Raises bad_xml, reporting the
> %% input at the point of mismatch, if the input does not start with
> %% Prefix.
> strip_prefix([], Input) ->
>     Input;
> strip_prefix([C | PrefixRest], [C | InputRest]) ->
>     strip_prefix(PrefixRest, InputRest);
> strip_prefix(_, Input) ->
>     ?bad_xml(Input).
> 
> 
> %% Drop leading whitespace (space, CR, LF, tab) from the input and
> %% return the remainder unchanged.
> strip([C | Rest]) when ?WHITESPACE(C) ->
>     strip(Rest);
> strip(Input) ->
>     Input.
>     
> 
> %% Read characters up to the closing double quote. Returns
> %% {String, Rest}, where Rest is the input after the quote. Raises
> %% bad_xml if the input ends before a closing quote is found.
> scan_string([], Seen) ->
>     ?bad_xml(lists:reverse(Seen));
> scan_string([$" | Rest], Seen) ->
>     {lists:reverse(Seen), Rest};
> scan_string([C | Rest], Seen) ->
>     scan_string(Rest, [C | Seen]).
> 
> 

-- 
Björn Gustavsson, Erlang/OTP, Ericsson AB