[erlang-questions] built-in xml parser
Per Melin
per.melin@REDACTED
Thu Jun 25 21:14:15 CEST 2009
Joel Reymont:
> I used to avoid regular expressions but then the new 're' module became part
> of OTP.
>
> I'm now using regular expressions with abandon!
>
> Is there a chance that future versions of OTP come with a built-in XML
> parser based on a C library, just like 're'?
The solution is obvious; use 're' to parse XML.
-module(regexml).
-export([parse/1]).
%% XML_RE: a single regular expression that tokenises XML text ("shallow
%% parsing"). The macro expands to one string built from adjacent string
%% literals; backslashes are doubled because they pass through Erlang
%% string escaping before reaching the regex engine.
%%
%% Each match is one token, chosen from these top-level alternatives:
%%   * [^<]+               - a run of character data (anything up to '<')
%%   * <!-- ... -->        - a comment
%%   * <![CDATA[ ... ]]>   - a CDATA section
%%   * <!DOCTYPE ... >     - a document type declaration, including an
%%                           optional [ ... ] internal subset
%%   * <? name ... ?>      - a processing instruction
%%   * </ name >           - an end tag
%%   * < name attr="v" ... > or ... /> - a start or empty-element tag
%%
%% Names start with [A-Za-z_:] or any character outside \x00-\x7F and
%% continue with [A-Za-z0-9_:.-] or any character outside \x00-\x7F.
%%
%% Every group is non-capturing (?: ... ), so the only captured value per
%% match is the whole token. The closing delimiters and everything after
%% the initial '<' are optional in the pattern, so unterminated or
%% malformed markup (even a lone '<') is still consumed as a token rather
%% than stopping the scan.
-define(XML_RE, "[^<]+|<(?:!(?:--(?:[^-]*-(?:[^-][^-]*-)*->?)?"
"|\\[CDATA\\[(?:[^\\]]*](?:[^\\]]+])*]+"
"(?:[^\\]>][^\\]]*](?:[^\\]]+])*]+)*>)?"
"|DOCTYPE(?:[ \\n\\t\\r]+(?:[A-Za-z_:]"
"|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+(?:(?:[A-Za-z_:]"
"|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*|\"[^\"]*\""
"|'[^']*'))*(?:[ \\n\\t\\r]+)?"
"(?:\\[(?:<(?:!(?:--[^-]*-(?:[^-][^-]*-)*->"
"|[^-](?:[^\\]\"'><]+|\"[^\"]*\"|'[^']*')*>)"
"|\\?(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*(?:\\?>"
"|[\\n\\r\\t ][^?]*\\?+(?:[^>?][^?]*\\?+)*>))"
"|%(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*;|[ \\n\\t\\r]+)*](?:[ \\n\\t\\r]+)?)?>?)?)?"
"|\\?(?:(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*(?:\\?>"
"|[\\n\\r\\t ][^?]*\\?+(?:[^>?][^?]*\\?+)*>)?)?"
"|/(?:(?:[A-Za-z_:]|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+)?>?)?|(?:(?:[A-Za-z_:]"
"|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+(?:[A-Za-z_:]"
"|[^\\x00-\\x7F])(?:[A-Za-z0-9_:.-]"
"|[^\\x00-\\x7F])*(?:[ \\n\\t\\r]+)?="
"(?:[ \\n\\t\\r]+)?(?:\"[^<\"]*\""
"|'[^<']*'))*(?:[ \\n\\t\\r]+)?/?>?)?)").
%% @doc Split String into shallow XML tokens.
%%
%% Hands String to re:run/3 with the XML_RE pattern. The `global' option
%% makes re:run/3 report every successive match instead of only the first,
%% and `{capture, all, list}' returns the matched text as lists (strings).
%% The result is passed back unchanged, i.e. whatever re:run/3 yields for
%% these options (`{match, Matches}' or `nomatch').
parse(String) ->
    RunOptions = [{capture, all, list}, global],
    re:run(String, ?XML_RE, RunOptions).
---
Adapted from http://www.cs.sfu.ca/~cameron/REX.html ("XML Shallow
Parsing with Regular Expressions").
More information about the erlang-questions
mailing list