 # build/awk/xml.awk - platform/ndk - Git at Google

 # Copyright (C) 2010 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 # Tiny XML parser implementation in awk.
 #
 # This file is not meant to be used directly, instead copy the
 # functions it defines here into your own script then specialize
 # it appropriately.
 #

 # See further below for usage instructions and implementation details.
 #

 # ---------------------------- cut here ---------------------------

 # xml_event: read the next XML tag from the input and report it as an event.
 #
 # Returns 1 when an event was parsed, or 0 when getline reports end of
 # input / an error, or when the record read holds no '<' at all.
 # Malformed input is reported through _xml_panic(), which exits the script.
 #
 # On a successful return the following globals are set:
 #   XML_TYPE   "BEGIN" for an opening tag, "END" for a closing one.
 #   XML_TAG    the tag name, converted to uppercase.
 #   XML_ATTR   attribute map (lowercase names, raw values); cleared on
 #              every call, then filled from the attributes of the tag.
 #   XML_RPATH  reversed element path, updated through _xml_enter() and
 #              _xml_exit().
 #
 # Side effects: sets RS to ">" and overwrites $0 while parsing.
 function xml_event () {
     RS=">";
     XML_TAG=XML_TYPE="";
     split("", XML_ATTR);   # portable way to empty the array
     while ( 1 ) {
         if (_xml_closing) { # delayed direct tag closure
             XML_TAG = _xml_closing;
             XML_TYPE = "END";
             _xml_closing = "";
             _xml_exit(XML_TAG);
             return 1;
         }
         if (getline <= 0) return 0; # read new input record
         _xml_p = index($0, "<"); # get start marker
         if (_xml_p == 0) return 0; # end of file (or malformed input)
         $0 = substr($0, _xml_p) # remove anything before '<'
         # ignore CData / Comments / Processing instructions / Declarations
         if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
             _xml_in_section("<!--", "--") ||
             _xml_in_section("<\\?", "\\?") ||
             _xml_in_section("<!", "")) {
             continue;
         }
         if (substr($0, 1, 2) == "</") { # is it a closing tag ?
             XML_TYPE = "END";
             $0 = substr($0, 3);
         } else { # nope, it's an opening one
             XML_TYPE = "BEGIN";
             $0 = substr($0, 2);
         }
         XML_TAG = $0
         sub("[ \n\t/].*$", "", XML_TAG);  # extract tag name
         XML_TAG = toupper(XML_TAG);       # uppercase it
         if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
             _xml_panic("Invalid tag name: " XML_TAG);
         if (XML_TYPE == "BEGIN") {  # update reverse path
             _xml_enter(XML_TAG);
         } else {
             _xml_exit(XML_TAG);
         }
         # Get rid of the tag name and the spaces after it. The name must
         # stop at '/' too: otherwise the '/' of a bare <foo/> is swallowed
         # together with the name and the tag is never reported as closed.
         sub("^[^ \n\t/]*[ \n\t]*", "", $0);
         # Compare against "" explicitly: a plain truth test would treat a
         # leftover such as "0" as false and silently skip it.
         while ($0 != "") { # process attributes
             if ($0 == "/") {  # deal with direct closing tag, e.g. <foo/>
                 _xml_closing = XML_TAG; # record delayed tag closure.
                 break
             }
             _xml_attrib = $0;
             sub(/=.*$/,"",_xml_attrib);  # extract attribute name
             sub(/^[^=]*/,"",$0);         # remove it from record
             _xml_attrib = tolower(_xml_attrib);
             if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
                 _xml_panic("Invalid attribute name: " _xml_attrib);
             if (substr($0,1,2) == "=\"") { # value is ="something"
                 _xml_value = substr($0,3);
                 sub(/".*$/,"",_xml_value);
                 sub(/^="[^"]*"/,"",$0);
             } else if (substr($0,1,2) == "='") { # value is ='something'
                 _xml_value = substr($0,3);
                 sub(/'.*$/,"",_xml_value);
                 sub(/^='[^']*'/,"",$0);
             } else {
                 _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
             }
             XML_ATTR[_xml_attrib] = _xml_value;  # store attribute name/value
             sub(/^[ \t\n]*/,"",$0); # get rid of remaining leading spaces
         }
         return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
     }
 }

 # _xml_panic: report a fatal error and stop the script.
 #
 #   msg  the text printed to "/dev/stderr".
 #
 # Does not return to the caller: it calls exit with status 1.
 function _xml_panic (msg) {
     print msg > "/dev/stderr";
     exit 1;
 }

 # _xml_in_section: skip an ignorable section if the current record opens one.
 #
 #   sec_begin  regex source of the section opener; matched anchored at the
 #              start of $0.
 #   sec_end    regex source of the section terminator (without its trailing
 #              '>'); matched anchored at the end of $0.
 #
 # Returns 0, without reading anything, when $0 does not start with
 # sec_begin. Otherwise keeps reading records with getline until one ends
 # with sec_end and then returns 1. Calls _xml_panic() when getline fails
 # or reaches end of input first.
 function _xml_in_section (sec_begin, sec_end) {
     if (match($0, "^" sec_begin) == 0)
         return 0;
     for (;;) {
         # the terminator may only show up several records later, since the
         # section body is allowed to contain '>' characters
         if (match($0, sec_end "$") != 0)
             return 1;
         if (getline <= 0)
             _xml_panic("Unexpected EOF: " ERRNO);
     }
 }

 # _xml_enter: record that an element was opened.
 #
 #   tag  name of the element being entered.
 #
 # Prepends tag followed by "/" to the global XML_RPATH.
 function _xml_enter (tag) {
     XML_RPATH = sprintf("%s/%s", tag, XML_RPATH);
 }

 # _xml_exit: record that an element was closed, validating nesting.
 #
 #   tag  name of the element being closed.
 #
 # The innermost open element is the first "/"-terminated component of the
 # global XML_RPATH. If it differs from tag (including the case where
 # XML_RPATH is empty), _xml_panic() is called. Otherwise that component is
 # removed from XML_RPATH.
 #
 # Clobbers the globals _xml_p and _xml_expected.
 function _xml_exit (tag) {
     _xml_p = index(XML_RPATH, "/");
     _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
     # Check the tag that was passed in, not the XML_TAG global, so the
     # function honours its own argument.
     if (_xml_expected != tag)
         _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
     XML_RPATH = substr(XML_RPATH, _xml_p+1);
 }

 # ---------------------------- cut here ---------------------------

 # USAGE:
 #
 # The functions provided here are used to extract the tags and attributes of a
 # given XML file. They do not support extraction of data, CDATA, comments,
 # processing instructions and declarations at all.
 #
 # You should use this from the BEGIN {} action of your awk script (it will
 # not work from an END {} action).
 #
 # Call xml_event() in a while loop. This function returns 1 for each XML
 # 'event' encountered, or 0 when the end of input is reached. Note that in
 # case of malformed input, an error will be printed and the script will
 # force an exit(1)
 #
 # After each successful xml_event() call, the following variables will be set:
 #
 #    XML_TYPE:  type of event: "BEGIN" -> mean an opening tag, "END" a
 #               closing one.
 #
 #    XML_TAG:   name of the tag, always in UPPERCASE!
 #
 #    XML_ATTR:  a map of attributes for the type. Only set for "BEGIN" types.
 #               all attribute names are in lowercase.
 #
 #               beware: values are *not* unescaped !
 #
 #    XML_RPATH: the _reversed_ element path, using "/" as a separator.
 #               if you are within the <manifest><application> tag, then
 #               it will be set to "APPLICATION/MANIFEST/"
 #               (note the trailing slash).
 #

 # This is a simple example that dumps the output of the parsing.
 #
 # Example driver: print one line per XML event, followed by the attributes
 # of every opening tag.
 BEGIN {
     for (;;) {
         if (!xml_event())
             break;   # xml_event() returned 0: no more events
         printf("XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH);
         if (XML_TYPE == "BEGIN")
             for (attr in XML_ATTR)
                 printf(" %s='%s'", attr, XML_ATTR[attr]);
         printf("\n");
     }
 }

 # IMPLEMENTATION DETAILS:
 #
 # 1. '>' as the record separator:
 #
 # RS is set to '>' to use this character as the record separator, instead of
 # the default '\n'. This means that something like the following:
 #
 #   <foo><bar attrib="value">stuff</bar></foo>
 #
 # will be translated into the following successive 'records':
 #
 #  <foo
 #  <bar attrib="value"
 #  stuff</bar
 #  </foo
 #
 # Note that the '>' is never part of the records and thus will not be matched.
 # If the record does not contain a single '<', the input is either
 # malformed XML, or we reached the end of file with data after the last
 # '>'.
 #
 # Newlines in the original input are kept in the records as-is.
 #
 # 2. Getting rid of unwanted stuff:
 #
 # We don't need any of the data within elements, so we get rid of them by
 # simply ignoring anything before the '<' in the current record. This is
 # done with code like this:
 #
 #     p = index($0, "<");       # get index of '<'
 #     if (p == 0) -> return 0;  # malformed input or end of file
 #     $0 = substr($0, p);       # remove anything before the '<' in record
 #
 # We also want to ignore certain sections like CDATA, comments, declarations,
 # etc.. These begin with a certain pattern and end with another one, e.g.
 # "<!--" and "-->" for comments. This is handled by the _xml_in_section()
 # function that accepts two patterns as input:
 #
 #    sec_begin: is the pattern for the start of the record.
 #    sec_end:   is the pattern for the end of the record (minus trailing '>').
 #
 # The function deals with the fact that these section can embed a valid '>'
 # and will then span multiple records, i.e. something like:
 #
 #  <!-- A comment with an embedded > right here ! -->
 #
 # will be decomposed into two records:
 #
 #   "<!-- A comment with an embedded "
 #   " right here ! --"
 #
 # The function deals with this case, and exits when such a section is not
 # properly terminated in the input.
 #
 # _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
 #
 # 3. Extracting the tag name:
 #
 # </foo> is a closing tag, and <foo> an opening tag, this is handled
 # by the following code:
 #
 #       if (substr($0, 1, 2) == "</") {
 #           XML_TYPE = "END";
 #           $0 = substr($0, 3);
 #       } else {
 #           XML_TYPE = "BEGIN";
 #           $0 = substr($0, 2);
 #       }
 #
 # which defines XML_TYPE, and removes the leading "</" or "<" from the record.
 # The tag is later extracted and converted to uppercase with:
 #
 #       XML_TAG = $0                      # copy record
 #       sub("[ \n\t/].*$", "", XML_TAG);  # remove anything after tag name
 #       XML_TAG = toupper(XML_TAG);       # convert to uppercase
 #       # validate tag
 #       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
 #
 # Then the record is purged from the tag name and the spaces after it:
 #
 #       # get rid of tag and spaces after it in $0
 #       sub("[^ \n\t]*[ \n\t]*", "", $0);
 #
 # 4. Maintaining XML_RPATH:
 #
 # The _xml_enter() and _xml_exit() functions are called to maintain the
 # XML_RPATH variable when entering and exiting specific tags. _xml_exit()
 # will also validate the input, checking proper tag enclosure (or exit(1)
 # in case of error).
 #
 #       if (XML_TYPE == "BEGIN") {
 #           _xml_enter(XML_TAG);
 #       } else {
 #           _xml_exit(XML_TAG);
 #       }
 #
 # 5. Extracting attributes:
 #
 # A loop is implemented to parse attributes, the idea is to get the attribute
 # name, which is always followed by a '=' character:
 #
 #           _xml_attrib = $0;              # copy record.
 #           sub(/=.*$/,"",_xml_attrib);    # get rid of '=' and anything after.
 #           sub(/^[^=]*/,"",$0);           # remove attribute name from $0
 #           _xml_attrib = tolower(_xml_attrib);
 #           if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
 #               _xml_panic("Invalid attribute name: " _xml_attrib);
 #
 # Now get the value, which is enclosed by either (") or (')
 #
 #          if (substr($0,1,2) == "=\"") {        # if $0 begins with ="
 #               _xml_value = substr($0,3);       # extract value
 #               sub(/".*$/,"",_xml_value);
 #               sub(/^="[^"]*"/,"",$0);          # remove it from $0
 #           } else if (substr($0,1,2) == "='") { # if $0 begins with ='
 #               _xml_value = substr($0,3);       # extract value
 #               sub(/'.*$/,"",_xml_value);
 #               sub(/^='[^']*'/,"",$0);          # remove it from $0
 #           } else {
 #               -> panic (malformed input)
 #           }
 #
 # After that, we simply store the value into the XML_ATTR associative
 # array, and cleanup $0 from leading spaces:
 #
 #           XML_ATTR[_xml_attrib] = _xml_value;
 #           sub(/^[ \t\n]*/,"",$0);
 #
 #
 # 6. Handling direct tag closure:
 #
 # When a tag is closed directly (as in <foo/>), A single '/' will be
 # parsed in the attribute parsing loop. We need to record this for the
 # next call to xml_event(), since the current one should return a "BEGIN"
 # for the "FOO" tag instead.
 #
 # We do this by setting the special _xml_closing variable, as in:
 #
 #          if ($0 == "/") {
 #               # record a delayed tag closure for the next call
 #               _xml_closing = XML_TAG;
 #               break
 #           }
 #
 # This variable is checked at the start of xml_event() like this:
 #
 #       # delayed tag closure - see below
 #       if (_xml_closing) {
 #           XML_TAG = _xml_closing;
 #           XML_TYPE = "END";
 #           _xml_closing = "";
 #           _xml_exit(XML_TAG);
 #           return 1;
 #       }
 #
 # Note the call to _xml_exit() to update XML_RPATH here.
 #
	# Copyright (C) 2010 The Android Open Source Project
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	# Tiny XML parser implementation in awk.
	#
	# This file is not meant to be used directly, instead copy the
	# functions it defines here into your own script then specialize
	# it appropriately.
	#

	# See further below for usage instructions and implementation details.
	#

	# ---------------------------- cut here ---------------------------

	# xml_event: read the next XML tag from the input and report it as an event.
	#
	# Returns 1 when an event was parsed, or 0 when getline reports end of
	# input / an error, or when the record read holds no '<' at all.
	# Malformed input is reported through _xml_panic(), which exits the script.
	#
	# On a successful return the following globals are set:
	#   XML_TYPE   "BEGIN" for an opening tag, "END" for a closing one.
	#   XML_TAG    the tag name, converted to uppercase.
	#   XML_ATTR   attribute map (lowercase names, raw values); cleared on
	#              every call, then filled from the attributes of the tag.
	#   XML_RPATH  reversed element path, updated through _xml_enter() and
	#              _xml_exit().
	#
	# Side effects: sets RS to ">" and overwrites $0 while parsing.
	function xml_event () {
	    RS=">";
	    XML_TAG=XML_TYPE="";
	    split("", XML_ATTR);   # portable way to empty the array
	    while ( 1 ) {
	        if (_xml_closing) { # delayed direct tag closure
	            XML_TAG = _xml_closing;
	            XML_TYPE = "END";
	            _xml_closing = "";
	            _xml_exit(XML_TAG);
	            return 1;
	        }
	        if (getline <= 0) return 0; # read new input record
	        _xml_p = index($0, "<"); # get start marker
	        if (_xml_p == 0) return 0; # end of file (or malformed input)
	        $0 = substr($0, _xml_p) # remove anything before '<'
	        # ignore CData / Comments / Processing instructions / Declarations
	        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
	            _xml_in_section("<!--", "--") ||
	            _xml_in_section("<\\?", "\\?") ||
	            _xml_in_section("<!", "")) {
	            continue;
	        }
	        if (substr($0, 1, 2) == "</") { # is it a closing tag ?
	            XML_TYPE = "END";
	            $0 = substr($0, 3);
	        } else { # nope, it's an opening one
	            XML_TYPE = "BEGIN";
	            $0 = substr($0, 2);
	        }
	        XML_TAG = $0
	        sub("[ \n\t/].*$", "", XML_TAG); # extract tag name
	        XML_TAG = toupper(XML_TAG); # uppercase it
	        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) # validate it
	            _xml_panic("Invalid tag name: " XML_TAG);
	        if (XML_TYPE == "BEGIN") { # update reverse path
	            _xml_enter(XML_TAG);
	        } else {
	            _xml_exit(XML_TAG);
	        }
	        # Get rid of the whole tag name and the spaces after it (both
	        # classes need a '*' quantifier). The name must stop at '/' too:
	        # otherwise the '/' of a bare <foo/> is swallowed together with
	        # the name and the tag is never reported as closed.
	        sub("^[^ \n\t/]*[ \n\t]*", "", $0);
	        # Compare against "" explicitly: a plain truth test would treat a
	        # leftover such as "0" as false and silently skip it.
	        while ($0 != "") { # process attributes
	            if ($0 == "/") { # deal with direct closing tag, e.g. <foo/>
	                _xml_closing = XML_TAG; # record delayed tag closure.
	                break
	            }
	            _xml_attrib = $0;
	            sub(/=.*$/,"",_xml_attrib); # extract attribute name
	            sub(/^[^=]*/,"",$0); # remove it from record
	            _xml_attrib = tolower(_xml_attrib);
	            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
	                _xml_panic("Invalid attribute name: " _xml_attrib);
	            if (substr($0,1,2) == "=\"") { # value is ="something"
	                _xml_value = substr($0,3);
	                sub(/".*$/,"",_xml_value);
	                sub(/^="[^"]*"/,"",$0);
	            } else if (substr($0,1,2) == "='") { # value is ='something'
	                _xml_value = substr($0,3);
	                sub(/'.*$/,"",_xml_value);
	                sub(/^='[^']*'/,"",$0);
	            } else {
	                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
	            }
	            XML_ATTR[_xml_attrib] = _xml_value; # store attribute name/value
	            sub(/^[ \t\n]*/,"",$0); # get rid of remaining leading spaces
	        }
	        return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
	    }
	}

	# _xml_panic: report a fatal error and stop the script.
	#
	#   msg  the text printed to "/dev/stderr".
	#
	# Does not return to the caller: it calls exit with status 1.
	function _xml_panic (msg) {
	    print msg > "/dev/stderr";
	    exit 1;
	}

	# _xml_in_section: skip an ignorable section if the current record opens one.
	#
	#   sec_begin  regex source of the section opener; matched anchored at the
	#              start of $0.
	#   sec_end    regex source of the section terminator (without its trailing
	#              '>'); matched anchored at the end of $0.
	#
	# Returns 0, without reading anything, when $0 does not start with
	# sec_begin. Otherwise keeps reading records with getline until one ends
	# with sec_end and then returns 1. Calls _xml_panic() when getline fails
	# or reaches end of input first.
	function _xml_in_section (sec_begin, sec_end) {
	    if (match($0, "^" sec_begin) == 0)
	        return 0;
	    for (;;) {
	        # the terminator may only show up several records later, since the
	        # section body is allowed to contain '>' characters
	        if (match($0, sec_end "$") != 0)
	            return 1;
	        if (getline <= 0)
	            _xml_panic("Unexpected EOF: " ERRNO);
	    }
	}

	# _xml_enter: record that an element was opened.
	#
	#   tag  name of the element being entered.
	#
	# Prepends tag followed by "/" to the global XML_RPATH.
	function _xml_enter (tag) {
	    XML_RPATH = sprintf("%s/%s", tag, XML_RPATH);
	}

	# _xml_exit: record that an element was closed, validating nesting.
	#
	#   tag  name of the element being closed.
	#
	# The innermost open element is the first "/"-terminated component of the
	# global XML_RPATH. If it differs from tag (including the case where
	# XML_RPATH is empty), _xml_panic() is called. Otherwise that component is
	# removed from XML_RPATH.
	#
	# Clobbers the globals _xml_p and _xml_expected.
	function _xml_exit (tag) {
	    _xml_p = index(XML_RPATH, "/");
	    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
	    # Check the tag that was passed in, not the XML_TAG global, so the
	    # function honours its own argument.
	    if (_xml_expected != tag)
	        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
	    XML_RPATH = substr(XML_RPATH, _xml_p+1);
	}

	# ---------------------------- cut here ---------------------------

	# USAGE:
	#
	# The functions provided here are used to extract the tags and attributes of a
	# given XML file. They do not support extraction of data, CDATA, comments,
	# processing instructions and declarations at all.
	#
	# You should use this from the BEGIN {} action of your awk script (it will
	# not work from an END {} action).
	#
	# Call xml_event() in a while loop. This function returns 1 for each XML
	# 'event' encountered, or 0 when the end of input is reached. Note that in
	# case of malformed input, an error will be printed and the script will
	# force an exit(1)
	#
	# After each successful xml_event() call, the following variables will be set:
	#
	# XML_TYPE: type of event: "BEGIN" -> mean an opening tag, "END" a
	# closing one.
	#
	# XML_TAG: name of the tag, always in UPPERCASE!
	#
	# XML_ATTR: a map of attributes for the type. Only set for "BEGIN" types.
	# all attribute names are in lowercase.
	#
	# beware: values are not unescaped !
	#
	# XML_RPATH: the _reversed_ element path, using "/" as a separator.
	# if you are within the <manifest><application> tag, then
	# it will be set to "APPLICATION/MANIFEST/"
	# (note the trailing slash).
	#

	# This is a simple example that dumps the output of the parsing.
	#
	# Example driver: print one line per XML event, followed by the attributes
	# of every opening tag.
	BEGIN {
	    for (;;) {
	        if (!xml_event())
	            break;   # xml_event() returned 0: no more events
	        printf("XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH);
	        if (XML_TYPE == "BEGIN")
	            for (attr in XML_ATTR)
	                printf(" %s='%s'", attr, XML_ATTR[attr]);
	        printf("\n");
	    }
	}

	# IMPLEMENTATION DETAILS:
	#
	# 1. '>' as the record separator:
	#
	# RS is set to '>' to use this character as the record separator, instead of
	# the default '\n'. This means that something like the following:
	#
	# <foo><bar attrib="value">stuff</bar></foo>
	#
	# will be translated into the following successive 'records':
	#
	# <foo
	# <bar attrib="value"
	# stuff</bar
	# </foo
	#
	# Note that the '>' is never part of the records and thus will not be matched.
	# If the record does not contain a single '<', the input is either
	# malformed XML, or we reached the end of file with data after the last
	# '>'.
	#
	# Newlines in the original input are kept in the records as-is.
	#
	# 2. Getting rid of unwanted stuff:
	#
	# We don't need any of the data within elements, so we get rid of them by
	# simply ignoring anything before the '<' in the current record. This is
	# done with code like this:
	#
	# p = index($0, "<"); # get index of '<'
	# if (p == 0) -> return 0; # malformed input or end of file
	# $0 = substr($0, p); # remove anything before the '<' in record
	#
	# We also want to ignore certain sections like CDATA, comments, declarations,
	# etc.. These begin with a certain pattern and end with another one, e.g.
	# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
	# function that accepts two patterns as input:
	#
	# sec_begin: is the pattern for the start of the record.
	# sec_end: is the pattern for the end of the record (minus trailing '>').
	#
	# The function deals with the fact that these section can embed a valid '>'
	# and will then span multiple records, i.e. something like:
	#
	# <!-- A comment with an embedded > right here ! -->
	#
	# will be decomposed into two records:
	#
	# "<!-- A comment with an embedded "
	# " right here ! --"
	#
	# The function deals with this case, and exits when such a section is not
	# properly terminated in the input.
	#
	# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
	#
	# 3. Extracting the tag name:
	#
	# </foo> is a closing tag, and <foo> an opening tag, this is handled
	# by the following code:
	#
	# if (substr($0, 1, 2) == "</") {
	# XML_TYPE = "END";
	# $0 = substr($0, 3);
	# } else {
	# XML_TYPE = "BEGIN";
	# $0 = substr($0, 2);
	# }
	#
	# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
	# The tag is later extracted and converted to uppercase with:
	#
	# XML_TAG = $0 # copy record
	# sub("[ \n\t/].*$", "", XML_TAG); # remove anything after tag name
	# XML_TAG = toupper(XML_TAG); # convert to uppercase
	# # validate tag
	# if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
	#
	# Then the record is purged from the tag name and the spaces after it:
	#
	# # get rid of tag and spaces after it in $0
	# sub("[^ \n\t]*[ \n\t]*", "", $0);
	#
	# 4. Maintaining XML_RPATH:
	#
	# The _xml_enter() and _xml_exit() functions are called to maintain the
	# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
	# will also validate the input, checking proper tag enclosure (or exit(1)
	# in case of error).
	#
	# if (XML_TYPE == "BEGIN") {
	# _xml_enter(XML_TAG);
	# } else {
	# _xml_exit(XML_TAG);
	# }
	#
	# 5. Extracting attributes:
	#
	# A loop is implemented to parse attributes, the idea is to get the attribute
	# name, which is always followed by a '=' character:
	#
	# _xml_attrib = $0; # copy record.
	# sub(/=.*$/,"",_xml_attrib); # get rid of '=' and anything after.
	# sub(/^[^=]*/,"",$0); # remove attribute name from $0
	# _xml_attrib = tolower(_xml_attrib);
	# if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
	# _xml_panic("Invalid attribute name: " _xml_attrib);
	#
	# Now get the value, which is enclosed by either (") or (')
	#
	# if (substr($0,1,2) == "=\"") { # if $0 begins with ="
	# _xml_value = substr($0,3); # extract value
	# sub(/".*$/,"",_xml_value);
	# sub(/^="[^"]*"/,"",$0); # remove it from $0
	# } else if (substr($0,1,2) == "='") { # if $0 begins with ='
	# _xml_value = substr($0,3); # extract value
	# sub(/'.*$/,"",_xml_value);
	# sub(/^='[^']*'/,"",$0); # remove it from $0
	# } else {
	# -> panic (malformed input)
	# }
	#
	# After that, we simply store the value into the XML_ATTR associative
	# array, and cleanup $0 from leading spaces:
	#
	# XML_ATTR[_xml_attrib] = _xml_value;
	# sub(/^[ \t\n]*/,"",$0);
	#
	#
	# 6. Handling direct tag closure:
	#
	# When a tag is closed directly (as in <foo/>), A single '/' will be
	# parsed in the attribute parsing loop. We need to record this for the
	# next call to xml_event(), since the current one should return a "BEGIN"
	# for the "FOO" tag instead.
	#
	# We do this by setting the special _xml_closing variable, as in:
	#
	# if ($0 == "/") {
	# # record a delayed tag closure for the next call
	# _xml_closing = XML_TAG;
	# break
	# }
	#
	# This variable is checked at the start of xml_event() like this:
	#
	# # delayed tag closure - see below
	# if (_xml_closing) {
	# XML_TAG = _xml_closing;
	# XML_TYPE = "END";
	# _xml_closing = "";
	# _xml_exit(XML_TAG);
	# return 1;
	# }
	#
	# Note the call to _xml_exit() to update XML_RPATH here.
	#