call for a lightweight xml parser
Bjorn Gustavsson
bjorn@REDACTED
Wed Oct 19 11:00:07 CEST 2005
Why not try to include a faster parser in the xmerl application?
xmerl could by default use a fast, slim-lined parser, and as soon
as it encounters something it can't handle, it can revert to the
slower but more complete parser.
I think that would be much better from the user's point of view
than introducing Yet Another XML parser for Erlang.
/Björn
"Ulf Wiger (AL/EAB)" <ulf.wiger@REDACTED> writes:
> I feel that I'm allowed to gripe a bit about xmerl, since I
> wrote the thing initially. (:
>
> xmerl has evolved into a mostly-complete xml parser
> that has quite a lot of hooks and gadgets -- good for
> tools that somehow operate on XML without realtime
> requirements (e.g. edoc et al).
>
> Leaving that aside for a moment, my own current need
> is to have a lightweight xml parser that handles reasonable
> XML chunks in a message-passing protocol.
>
> I wrote a small XML parser. It's good enough for what I
> want right now, so I'm probably going to leave it alone.
> I timed it on a small XML-like string (no header, and no
> DTD stuff):
>
> 18> timer:tc(vccXml,simple_xml,["<protocol name=\"vccBasic\" vsn=\"1.0\"/>"]).
> {20,{simple_xml,[{"protocol",[{"vsn","1.0"},{"name","vccBasic"}],[]}]}}
>
> Running the same string through xmerl:
>
> 15> timer:tc(xmerl_scan,string,["<protocol value=\"vccBasic\"/>"]).
> {410,
> {{xmlElement,protocol,
> protocol,
> [],
> {xmlNamespace,[],[]},
> [],
> 1,
> [{xmlAttribute,value,[],[],[],[],1,[],"vccBasic",false}],
> [],
> [],
> "/home/etxuwig/work/erlang/vcc-0.36",
> undeclared},
> []}}
>
> Personally, I'd rather write my own incomplete parser than
> pay half a millisecond just to parse a string of 37 characters...
>
> Perhaps xmerl could include an 'xmerl_lite' parser that handles
> the type of XML that you're most likely to encounter in a packet,
> no entity refs, no strange encodings, etc.?
>
> I've included my own light parser below. It's limited in that it
> doesn't strictly do one-character at a time(*), and it should perhaps
> handle some more stuff (e.g. the <?xml ...?> form) in order to be
> sufficiently generic. See it as a suggestion.
>
> (*) I don't need that at the moment, and my current objective is not to
> write an XML parser. I run {packet,4} semantics, and will always
> get the complete string.
>
> Someone else may have a more complete, and equally lightweight
> parser to contribute?
>
> /Uffe
>
> ===================================
>
> %% vccXml: a small XML parser working on a complete in-memory string.
> %% It recognises elements, double-quoted attribute values and text;
> %% malformed input is reported by raising an error.
> -module(vccXml).
>
> -export([simple_xml/1]).
>
> %% Whitespace test for a single character H (space, CR, LF or tab).
> %% Expands to a `;`-separated guard sequence, so it can only be used
> %% in guard position (after `when`, or as an `if` condition).
> -define(WHITESPACE(H), H==$\s; H==$\r; H==$\n; H==$\t).
> %% Raises error {bad_xml, Prefix}, where Prefix is the leading part
> %% (up to five characters) of the offending input T.
> -define(bad_xml(T), erlang:error({bad_xml, string:substr(T, 1, 5)})).
>
> %% simple_xml(Input) -> {simple_xml, Elements}
> %% Entry point. Input is a string or a binary; a binary is converted
> %% to a string and parsed the same way. Leading whitespace is skipped.
> %% Elements is the list of top-level {Tag, Attributes, Content} tuples.
> simple_xml(Bin) when is_binary(Bin) ->
>     simple_xml(binary_to_list(Bin));
> simple_xml(Str) ->
>     Elements = simple_xml(strip(Str), []),
>     {simple_xml, Elements}.
>
> %% simple_xml(Str, Acc) -> Elements
> %% Parses consecutive top-level elements until the input is exhausted.
> %% Acc collects the elements in reverse order.
> simple_xml([], Elements) ->
>     lists:reverse(Elements);
> simple_xml([$< | AfterBracket], Elements) ->
>     {Remaining, Elements1} = xml_tag(strip(AfterBracket), [], Elements),
>     simple_xml(Remaining, Elements1).
>
> %% xml_tag(Str, TagAcc, Acc) -> {Rest, Acc1}
> %% Scans an element name (Str starts just after "<"), accumulating it
> %% reversed in TagAcc, and then dispatches on what terminates it:
> %%   "/>"        - empty element without attributes
> %%   ">"         - element with content, no attributes
> %%   whitespace  - either a terminator follows ("<a />", "<a >") or
> %%                 the attribute list starts
> %% Returns the unparsed remainder and Acc with the element prepended.
> xml_tag("/>" ++ T, TagAcc, Acc) ->
>     {strip(T), [{lists:reverse(TagAcc), [], []}|Acc]};
> xml_tag(">" ++ Str, TagAcc, Acc) ->
>     xml_content(strip(Str), lists:reverse(TagAcc), [], [], Acc);
> xml_tag([H|T], TagAcc, Acc) when ?WHITESPACE(H) ->
>     %% Whitespace is allowed between the name and the closing "/>" or
>     %% ">"; without this check the terminator would be scanned as the
>     %% start of an attribute name.
>     case strip(T) of
>         "/>" ++ _ = Rest ->
>             xml_tag(Rest, TagAcc, Acc);
>         ">" ++ _ = Rest ->
>             xml_tag(Rest, TagAcc, Acc);
>         Rest ->
>             xml_attributes(Rest, [], [], lists:reverse(TagAcc), Acc)
>     end;
> xml_tag([H|T], TagAcc, Acc) ->
>     xml_tag(T, [H|TagAcc], Acc).
>
>
> %% xml_attributes(Str, NameAcc, Attrs, Tag, Acc) -> {Rest, Acc1}
> %% Scans one attribute name (accumulated reversed in NameAcc) up to the
> %% "=" sign, then hands over to xml_attr_value/5. Whitespace after the
> %% name is accepted only if "=" is the next non-blank character;
> %% anything else raises {bad_xml, _}.
> xml_attributes([$= | AfterEq], NameAcc, Attrs, Tag, Acc) ->
>     Name = lists:reverse(NameAcc),
>     xml_attr_value(strip(AfterEq), Name, Attrs, Tag, Acc);
> xml_attributes([C | Rest], NameAcc, Attrs, Tag, Acc) when ?WHITESPACE(C) ->
>     case strip(Rest) of
>         [$= | _] = FromEq ->
>             %% Re-dispatch to the "=" clause above.
>             xml_attributes(FromEq, NameAcc, Attrs, Tag, Acc);
>         _ ->
>             ?bad_xml(Rest)
>     end;
> xml_attributes([C | Rest], NameAcc, Attrs, Tag, Acc) ->
>     xml_attributes(Rest, [C | NameAcc], Attrs, Tag, Acc).
>
> %% xml_attr_value(Str, ATag, AAcc, Tag, Acc) -> {Rest, Acc1}
> %% Parses the double-quoted value of attribute ATag and decides what
> %% follows it:
> %%   "/>"  - the element is complete and has no content
> %%   ">"   - the element's content follows
> %%   else  - another attribute follows
> %% AAcc holds the attributes seen so far in reverse order; both
> %% terminating branches reverse it so that attributes are always
> %% returned in document order.
> %% Raises {bad_xml, _} if the value does not start with a double quote.
> xml_attr_value("\"" ++ T1, ATag, AAcc, Tag, Acc) ->
>     {Str, T2} = scan_string(T1, []),
>     AAcc1 = [{ATag, Str}|AAcc],
>     case strip(T2) of
>         "/>" ++ T3 ->
>             {strip(T3), [{Tag, lists:reverse(AAcc1), []}|Acc]};
>         ">" ++ T3 ->
>             xml_content(strip(T3), Tag, [], lists:reverse(AAcc1), Acc);
>         T3 ->
>             xml_attributes(T3, [], AAcc1, Tag, Acc)
>     end;
> xml_attr_value(T, _, _, _, _) ->
>     ?bad_xml(T).
>
>
>
>
> %% xml_content(Str, Tag, CAcc, Attrs, Acc) -> {Rest, Acc1}
> %% Parses the content of element Tag. CAcc collects child elements and
> %% text nodes in reverse order.
> %%   "</"  - must be followed by Tag and ">"; the finished element
> %%           {Tag, Attrs, Content} is prepended to Acc
> %%   "<"   - a child element
> %%   else  - start of a text node
> %% The remainder after the closing tag is stripped of leading
> %% whitespace, matching what the "/>" paths return, so that trailing
> %% blanks neither break top-level parsing nor leak into sibling text.
> xml_content("</" ++ Str, Tag, CAcc, Attrs, Acc) ->
>     Str1 = strip_prefix(Tag ++ ">", Str),
>     {strip(Str1), [{Tag, Attrs, lists:reverse(CAcc)}|Acc]};
> xml_content("<" ++ Str, Tag, CAcc, Attrs, Acc) ->
>     {Str1, CAcc1} = xml_tag(Str, [], CAcc),
>     xml_content(Str1, Tag, CAcc1, Attrs, Acc);
> xml_content([H|T], Tag, CAcc, Attrs, Acc) ->
>     xml_text(T, [H], Tag, CAcc, Attrs, Acc).
>
> %% xml_text(Str, TAcc, Tag, CAcc, Attrs, Acc) -> {Rest, Acc1}
> %% Collects characters (reversed, in TAcc) up to the next "<". The
> %% text, with trailing whitespace removed, is added to the content as
> %% {text, Text} and parsing continues in xml_content/5.
> xml_text([$< | _] = Rest, TextAcc, Tag, Children, Attrs, Acc) ->
>     %% TextAcc is reversed, so stripping its head trims trailing blanks.
>     Text = lists:reverse(strip(TextAcc)),
>     xml_content(Rest, Tag, [{text, Text} | Children], Attrs, Acc);
> xml_text([C | Rest], TextAcc, Tag, Children, Attrs, Acc) ->
>     xml_text(Rest, [C | TextAcc], Tag, Children, Attrs, Acc).
>
> %% strip_prefix(Prefix, Str) -> Rest
> %% Removes Prefix from the front of Str and returns what is left.
> %% Raises {bad_xml, _} at the first position where Str does not match.
> strip_prefix([], Remaining) ->
>     Remaining;
> strip_prefix([C | PrefixTail], [C | StrTail]) ->
>     strip_prefix(PrefixTail, StrTail);
> strip_prefix(_, Mismatch) ->
>     ?bad_xml(Mismatch).
>
>
> %% strip(Str) -> Str1
> %% Drops leading whitespace (see ?WHITESPACE) from Str. Any other
> %% term, including the empty list, is returned unchanged.
> strip([C | Rest] = Str) ->
>     if
>         ?WHITESPACE(C) -> strip(Rest);
>         true -> Str
>     end;
> strip(Other) ->
>     Other.
>
>
> %% scan_string(Str, Acc) -> {Value, Rest}
> %% Reads characters up to the closing double quote. Value is the text
> %% before the quote, Rest is what follows it. Raises {bad_xml, _} if
> %% the input ends before a closing quote is found.
> scan_string([], Seen) ->
>     ?bad_xml(lists:reverse(Seen));
> scan_string([$" | Rest], Seen) ->
>     {lists:reverse(Seen), Rest};
> scan_string([C | Rest], Seen) ->
>     scan_string(Rest, [C | Seen]).
>
>
--
Björn Gustavsson, Erlang/OTP, Ericsson AB
More information about the erlang-questions
mailing list