blob: 2cb28f50ba724e2011430ea0546d0e111b5a2e94 [file] [log] [blame]
# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tiny XML parser implementation in awk.
#
# This file is not meant to be used directly, instead copy the
# functions it defines here into your own script then specialize
# it appropriately.
#
# See further below for usage instructions and implementation details.
#
# ---------------------------- cut here ---------------------------
# xml_event: read input up to the next tag and report it as one event.
#
# Returns 1 after setting:
#   XML_TYPE  - "BEGIN" for an opening tag, "END" for a closing one
#   XML_TAG   - the tag name, uppercased
#   XML_ATTR  - attribute values keyed by lowercased attribute name
#   XML_RPATH - updated through _xml_enter() / _xml_exit()
# Returns 0 when getline reports end of input/error, or when a record
# contains no '<' at all.
# Invalid tag names, attribute names or attribute value syntax are reported
# through _xml_panic().
#
# A directly closed tag such as <foo/> or <foo a="b"/> produces two events:
# a "BEGIN" from this call and an "END" from the next one.
function xml_event () {
    RS = ">";                           # each record ends right before a '>'
    XML_TAG = "";
    XML_TYPE = "";
    split("", XML_ATTR);                # forget attributes of the previous event
    while ( 1 ) {
        if (_xml_closing) {             # delayed direct tag closure
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }
        if ((getline) <= 0) return 0;   # read new input record
        _xml_p = index($0, "<");        # get start marker
        if (_xml_p == 0) return 0;      # end of file (or malformed input)
        $0 = substr($0, _xml_p);        # remove anything before '<'
        # ignore CData / Comments / Processing instructions / Declarations
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }
        if (substr($0, 1, 2) == "</") { # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else {                        # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }
        XML_TAG = $0;
        sub("[ \n\t\r/].*$", "", XML_TAG);  # extract tag name
        XML_TAG = toupper(XML_TAG);         # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )   # validate it
            _xml_panic("Invalid tag name: " XML_TAG);
        if (XML_TYPE == "BEGIN") {      # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }
        # Get rid of the tag name and the spaces after it. The name stops at
        # '/' as well as at whitespace, so that the '/' of "<foo/>" is left in
        # $0 and is seen by the direct-closure check below.
        sub("^[^ \n\t\r/]*[ \n\t\r]*", "", $0);
        # Compare against "" explicitly: a plain truth test would also stop on
        # a remainder that looks like the number zero.
        while ($0 != "") {              # process attributes
            if ($0 == "/") {            # direct closing tag, e.g. <foo/>
                # Record the delayed closure for the next call. A stray '/'
                # at the end of a closing tag is ignored.
                if (XML_TYPE == "BEGIN")
                    _xml_closing = XML_TAG;
                break;
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib); # extract attribute name
            sub(/^[^=]*/,"",$0);        # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )   # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);
            if (substr($0,1,2) == "=\"") {          # value is ="something"
                _xml_value = substr($0,3);
                sub(/".*$/,"",_xml_value);
                sub(/^="[^"]*"/,"",$0);
            } else if (substr($0,1,2) == "='") {    # value is ='something'
                _xml_value = substr($0,3);
                sub(/'.*$/,"",_xml_value);
                sub(/^='[^']*'/,"",$0);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value; # store attribute name/value
            sub(/^[ \t\n\r]*/,"",$0);   # get rid of remaining leading spaces
        }
        return 1;   # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}
# _xml_panic: report a fatal parsing error.
# Writes msg, followed by the output record separator, to /dev/stderr and
# then terminates the script with exit status 1.
function _xml_panic (msg) {
    printf "%s%s", msg, ORS > "/dev/stderr";
    exit 1;
}
# _xml_in_section: skip an ignorable section that starts the current record.
#
#   sec_begin: regex for the section opener, matched at the start of $0.
#   sec_end:   regex for the section terminator without its trailing '>',
#              matched at the end of a record ("" accepts any record).
#   body:      local scratch variable, callers must not pass it.
#
# Returns 0 without reading anything when $0 does not start with sec_begin.
# Otherwise reads further records until one ends with sec_end, then
# returns 1. Calls _xml_panic() when getline fails before that happens.
function _xml_in_section (sec_begin, sec_end,    body) {
    if (!match($0, "^" sec_begin)) return 0;
    # On the first record only the text after the opener may terminate the
    # section. Otherwise the opener's own characters could be taken for the
    # terminator, e.g. the "--" of "<!--" in a comment starting "<!-->".
    body = substr($0, RLENGTH + 1);
    while (body !~ (sec_end "$")) {
        if ((getline) <= 0) _xml_panic("Unexpected EOF: " ERRNO);
        body = $0;
    }
    return 1;
}
# _xml_enter: push tag onto the front of the reversed element path.
# XML_RPATH becomes "<tag>/<previous XML_RPATH>".
function _xml_enter (tag) {
    XML_RPATH = sprintf("%s/%s", tag, XML_RPATH);
}
# _xml_exit: pop the innermost element from the reversed element path.
#
#   tag: name of the element being closed.
#
# The first component of XML_RPATH is the element that is currently open.
# If it differs from tag (including when XML_RPATH is empty), the input is
# not properly nested and _xml_panic() is called. Otherwise that component
# and its "/" are removed from XML_RPATH.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    # Validate the tag that was passed in, not the global XML_TAG.
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}
# ---------------------------- cut here ---------------------------
# USAGE:
#
# The functions provided here are used to extract the tags and attributes of a
# given XML file. They do not support extraction of data, CDATA, comments,
# processing instructions and declarations at all.
#
# You should use this from the BEGIN {} action of your awk script (it will
# not work from an END {} action).
#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1).
#
# After each successful xml_event() call, the following variables will be set:
#
# XML_TYPE: type of event: "BEGIN" means an opening tag, "END" a
# closing one.
#
# XML_TAG: name of the tag, always in UPPERCASE!
#
# XML_ATTR: a map of attributes for the type. Only set for "BEGIN" types.
# all attribute names are in lowercase.
#
# beware: values are *not* unescaped !
#
# XML_RPATH: the _reversed_ element path, using "/" as a separator.
# if you are within the <manifest><application> tag, then
# it will be set to "APPLICATION/MANIFEST/"
# (note the trailing slash).
#
# This is a simple example that dumps the output of the parsing.
#
# Example driver: print one line per XML event with its type, tag and
# reversed path, followed by the attributes when the event is a "BEGIN".
BEGIN {
    while ( xml_event() ) {
        _dump_line = sprintf("XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH);
        if (XML_TYPE == "BEGIN")
            for (name in XML_ATTR)
                _dump_line = _dump_line sprintf(" %s='%s'", name, XML_ATTR[name]);
        printf "%s\n", _dump_line;
    }
}
# IMPLEMENTATION DETAILS:
#
# 1. '>' as the record separator:
#
# RS is set to '>' to use this character as the record separator, instead of
# the default '\n'. This means that something like the following:
#
# <foo><bar attrib="value">stuff</bar></foo>
#
# will be translated into the following successive 'records':
#
# <foo
# <bar attrib="value"
# stuff</bar
# </foo
#
# Note that the '>' is never part of the records and thus will not be matched.
# If the record does not contain a single '<', the input is either
# malformed XML, or we reached the end of file with data after the last
# '>'.
#
# Newlines in the original input are kept in the records as-is.
#
# 2. Getting rid of unwanted stuff:
#
# We don't need any of the data within elements, so we get rid of them by
# simply ignoring anything before the '<' in the current record. This is
# done with code like this:
#
# p = index($0, "<"); # get index of '<'
# if (p == 0) -> return 0; # malformed input or end of file
# $0 = substr($0, p); # remove anything before the '<' in record
#
# We also want to ignore certain sections like CDATA, comments, declarations,
# etc.. These begin with a certain pattern and end with another one, e.g.
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
# function that accepts two patterns as input:
#
# sec_begin: is the pattern for the start of the record.
# sec_end: is the pattern for the end of the record (minus trailing '>').
#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
#
# <!-- A comment with an embedded > right here ! -->
#
# will be decomposed into two records:
#
# "<!-- A comment with an embedded "
# " right here ! --"
#
# The function deals with this case, and exits when such a section is not
# properly terminated in the input.
#
# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
#
# 3. Extracting the tag name:
#
# </foo> is a closing tag, and <foo> an opening tag, this is handled
# by the following code:
#
# if (substr($0, 1, 2) == "</") {
# XML_TYPE = "END";
# $0 = substr($0, 3);
# } else {
# XML_TYPE = "BEGIN";
# $0 = substr($0, 2);
# }
#
# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
# The tag is later extracted and converted to uppercase with:
#
# XML_TAG = $0 # copy record
# sub("[ \n\t/].*$", "", XML_TAG); # remove anything after tag name
# XML_TAG = toupper(XML_TAG); # convert to uppercase
# # validate tag
# if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
#
# Then the record is purged from the tag name and the spaces after it:
#
# # get rid of tag and spaces after it in $0
# sub("[^ \n\t]*[ \n\t]*", "", $0);
#
# 4. Maintaining XML_RPATH:
#
# The _xml_enter() and _xml_exit() functions are called to maintain the
# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
# will also validate the input, checking proper tag enclosure (or exit(1)
# in case of error).
#
# if (XML_TYPE == "BEGIN") {
# _xml_enter(XML_TAG);
# } else {
# _xml_exit(XML_TAG);
# }
#
# 5. Extracting attributes:
#
# A loop is implemented to parse attributes, the idea is to get the attribute
# name, which is always followed by a '=' character:
#
# _xml_attrib = $0; # copy record.
# sub(/=.*$/,"",_xml_attrib); # get rid of '=' and anything after.
# sub(/^[^=]*/,"",$0); # remove attribute name from $0
# _xml_attrib = tolower(_xml_attrib);
# if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
# _xml_panic("Invalid attribute name: " _xml_attrib);
#
# Now get the value, which is enclosed by either (") or (')
#
# if (substr($0,1,2) == "=\"") { # if $0 begins with ="
# _xml_value = substr($0,3); # extract value
# sub(/".*$/,"",_xml_value);
# sub(/^="[^"]*"/,"",$0); # remove it from $0
# } else if (substr($0,1,2) == "='") { # if $0 begins with ='
# _xml_value = substr($0,3); # extract value
# sub(/'.*$/,"",_xml_value);
# sub(/^='[^']*'/,"",$0); # remove it from $0
# } else {
# -> panic (malformed input)
# }
#
# After that, we simply store the value into the XML_ATTR associative
# array, and cleanup $0 from leading spaces:
#
# XML_ATTR[_xml_attrib] = _xml_value;
# sub(/^[ \t\n]*/,"",$0);
#
#
# 6. Handling direct tag closure:
#
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
#
# We do this by setting the special _xml_closing variable, as in:
#
# if ($0 == "/") {
# # record a delayed tag closure for the next call
# _xml_closing = XML_TAG;
# break
# }
#
# This variable is checked at the start of xml_event() like this:
#
# # delayed tag closure - see below
# if (_xml_closing) {
# XML_TAG = _xml_closing;
# XML_TYPE = "END";
# _xml_closing = "";
# _xml_exit(XML_TAG);
# return 1;
# }
#
# Note the call to _xml_exit() to update XML_RPATH here.
#