[erlang-questions] built-in xml parser

Per Melin per.melin@REDACTED
Thu Jun 25 21:14:15 CEST 2009


Joel Reymont:
> I used to avoid regular expressions but then the new 're' module became part
> of OTP.
>
> I'm now using regular expressions with abandon!
>
> Is there a chance that future versions of OTP come with a built-in XML
> parser based on a C library, just like 're'?

The solution is obvious; use 're' to parse XML.

-module(regexml).

-export([parse/1]).

%% XML_RE: a single regular expression (written as adjacent string literals
%% that the compiler concatenates) that splits an XML document into
%% consecutive text and markup tokens.
%%
%% Top-level shape:  [^<]+  |  < (?: ...markup alternatives... )
%%   * `[^<]+` matches a run of character data (anything up to the next `<`).
%%   * After `<`, the alternatives are tried in this order:
%%       `!`  followed by a comment (`--`), a CDATA section or a DOCTYPE,
%%       `?`  a processing instruction,
%%       `/`  an end tag,
%%       otherwise a start tag or empty-element tag with attributes.
%%
%% Names are matched by an ASCII letter, `_`, `:` or any character outside
%% \x00-\x7F, followed by letters, digits, `_`, `:`, `.`, `-` or further
%% non-ASCII characters. Whitespace is the class [ \n\t\r].
%%
%% Most closing delimiters and tails are wrapped in optional groups (`>?`,
%% `(...)?`), so truncated markup -- even a lone `<` -- still matches as a
%% token instead of making the whole match fail.
%%
%% Every group is non-capturing (`(?:`), so a match yields only the overall
%% matched text. Backslashes are doubled because this is an Erlang string:
%% "\\n" reaches the regex engine as `\n`, "\\x00" as `\x00`, and so on.
%%
%% This is a shallow tokenizer only; it does not check nesting or
%% well-formedness of the document.
-define(XML_RE, "[^<]+|<(?:!(?:--(?:[^-]*-(?:[^-][^-]*-)*->?)?"
                %% CDATA section: <![CDATA[ ... ]]>
                "|\\[CDATA\\[(?:[^\\]]*](?:[^\\]]+])*]+"
                "(?:[^\\]>][^\\]]*](?:[^\\]]+])*]+)*>)?"
                %% DOCTYPE declaration: a name, then further names or quoted
                %% literals separated by whitespace.
                "|DOCTYPE(?:[ \\n\\t\\r]+(?:[A-Za-z_:]"
                "|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+(?:(?:[A-Za-z_:]"
                "|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*|\"[^\"]*\""
                "|'[^']*'))*(?:[ \\n\\t\\r]+)?"
                %% Optional internal subset in [ ... ]: comments, other <!...>
                %% declarations, processing instructions, %name; references
                %% and whitespace.
                "(?:\\[(?:<(?:!(?:--[^-]*-(?:[^-][^-]*-)*->"
                "|[^-](?:[^\\]\"'><]+|\"[^\"]*\"|'[^']*')*>)"
                "|\\?(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*(?:\\?>"
                "|[\\n\\r\\t ][^?]*\\?+(?:[^>?][^?]*\\?+)*>))"
                "|%(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*;|[ \\n\\t\\r]+)*](?:[ \\n\\t\\r]+)?)?>?)?)?"
                %% Processing instruction: <?target ... ?>
                "|\\?(?:(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*(?:\\?>"
                "|[\\n\\r\\t ][^?]*\\?+(?:[^>?][^?]*\\?+)*>)?)?"
                %% End tag </name>, then (after the next top-level `|`) a
                %% start or empty-element tag: name, attributes of the form
                %% name = "value" or name = 'value' (values may not contain
                %% `<`), optional `/`, optional `>`.
                "|/(?:(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+)?>?)?|(?:(?:[A-Za-z_:]"
                "|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+(?:[A-Za-z_:]"
                "|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
                "|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+)?="
                "(?:[ \\n\\t\\r]+)?(?:\"[^<\"]*\""
                "|'[^<']*'))*(?:[ \\n\\t\\r]+)?/?>?)?)").

%% @doc Tokenize `String' by running the XML_RE pattern over it globally.
%%
%% Returns whatever `re:run/3' returns for these options: `{match, Matches}'
%% where each element of `Matches' is a list holding the matched text as a
%% string (the pattern has no capturing groups, so only the whole match is
%% reported), or `nomatch' if nothing matched.
%%
%% The pattern is passed uncompiled, so `re:run/3' compiles it on every call.
parse(String) ->
    %% `global' collects every successive token; `{capture, all, list}'
    %% returns the matched text as lists rather than index pairs.
    RunOptions = [{capture, all, list}, global],
    re:run(String, ?XML_RE, RunOptions).

---

Adapted from http://www.cs.sfu.ca/~cameron/REX.html ("XML Shallow
Parsing with Regular Expressions").


More information about the erlang-questions mailing list