// blob: 50c780aaad06071aff80e8b0b818061f9b5d0d0a [file] [log] [blame]
/*---------------------------------------------------------------------------*
* grxmldoc.cpp *
* *
* Copyright 2007, 2008 Nuance Communications, Inc. *
* *
* Licensed under the Apache License, Version 2.0 (the 'License'); *
* you may not use this file except in compliance with the License. *
* *
* You may obtain a copy of the License at *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an 'AS IS' BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
*---------------------------------------------------------------------------*/
#include <assert.h>
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm> // for std::sort
#include "tinyxml.h"
#include "grph.h" // The word graph object and interface
#include "sub_grph.h" // The sub-graph object and interface
#include "hashmap.h"
#include "grxmldoc.h"
#include "ESR_Session.h"
//#include "LCHAR.h"
// Set to a non-zero value to enable the DEBUG_PRINT tracing below.
#define GRXML_DEBUG 0
#define MAX_PATH_NAME 512
// Prints x to stdout and terminates the process via exit(y).
// Expands to a braced block, so code following it in the same scope never runs.
#define FATAL_ERROR(x,y) { std::cout << (x) << std::endl; exit ((y)); }
// Prints x to stdout. Note that the expansion already ends with a semicolon.
#define WARNING(x) std::cout << (x) << std::endl;
#if GRXML_DEBUG
//#define DEBUG_PRINT(x) //
// Debug build: DEBUG_PRINT streams its argument to stdout; PRINT_EXPRESSION is disabled.
#define DEBUG_PRINT(x) std::cout << (x) << std::endl;
#define PRINT_EXPRESSION(x)
//#define PRINT_EXPRESSION(x) std::cout << (x) << std::endl;
#else
// Non-debug build: both macros expand to nothing (the trailing "//" is a comment),
// so their arguments are never evaluated.
#define DEBUG_PRINT(x) //
#define PRINT_EXPRESSION(x) //
#endif
using namespace std;
// Reports (on stdout) when string s is empty; t names the kind of string for the message.
// This only prints a message - it does not abort or return from the caller.
#define CHECK_NOT_EMPTY(s, t) { if (s.empty()) \
{ \
std::cout << "ERROR: Empty string of type " << t <<std::endl; \
} \
}
/**
 * Returns true when text starts with a decimal digit, i.e. when atoi() has
 * at least one digit to consume.
 */
static bool grxml_starts_with_digit(const std::string& text)
{
    return !text.empty() && text[0] >= '0' && text[0] <= '9';
}

/**
 * Parses a repeat/count specification.
 *
 * Accepted forms:
 *   "m-n"  -> *minCnt = m, *maxCnt = n
 *   "m-"   -> *minCnt = m, *maxCnt = -1 (no upper bound)
 *   "m+"   -> *minCnt = m, *maxCnt = -1 (no upper bound)
 *   "m"    -> *minCnt = *maxCnt = m
 *
 * Each number only has to start with a digit; it is converted with atoi().
 *
 * @param s       the specification text
 * @param minCnt  receives the minimum count
 * @param maxCnt  receives the maximum count, or -1 when unbounded
 * @return 0 on success, 1 when a number was expected but not found. On
 *         failure the outputs may have been partially written.
 */
int get_range(const std::string& s, int* minCnt, int* maxCnt)
{
    // The find() result must be kept in size_type: storing it in an
    // unsigned int truncates npos on 64-bit targets, which makes the
    // "not found" test always fail and sends every input down the
    // "m-n" branch.
    const std::string::size_type dashPos = s.find('-');
    if (dashPos != std::string::npos) {
        const std::string minText(s, 0, dashPos);
        if (!grxml_starts_with_digit(minText)) return 1;
        *minCnt = atoi(minText.c_str());

        // An absent upper bound means "unbounded".
        *maxCnt = -1;
        const std::string maxText(s, dashPos + 1);
        if (!maxText.empty()) {
            if (!grxml_starts_with_digit(maxText)) return 1;
            *maxCnt = atoi(maxText.c_str());
        }
        return 0;
    }

    const std::string::size_type plusPos = s.find('+');
    if (plusPos != std::string::npos) {
        const std::string minText(s, 0, plusPos);
        if (!grxml_starts_with_digit(minText)) return 1;
        *minCnt = atoi(minText.c_str());
        *maxCnt = -1;
        return 0;
    }

    // A plain number: exact repeat count.
    if (!grxml_starts_with_digit(s)) return 1;
    *minCnt = *maxCnt = atoi(s.c_str());
    return 0;
}
// Builds the element-name -> node-type lookup table and resets all counters.
GRXMLDoc::GRXMLDoc()
{
    // Element names recognised by beginNode()/endNode(), in registration order.
    static const struct {
        const char*   name;
        KeywordValues type;
    } kKnownElements[] = {
        { "grammar", NodeTypeGrammar },
        { "rule",    NodeTypeRule },
        { "ruleref", NodeTypeRuleReference },
        { "one-of",  NodeTypeOneOf },
        { "item",    NodeTypeItem },
        { "tag",     NodeTypeTag },
        { "count",   NodeTypeCount },
        { "meta",    NodeTypeMeta }
    };
    const unsigned int numKnown = sizeof(kKnownElements) / sizeof(kKnownElements[0]);
    for (unsigned int i = 0; i < numKnown; ++i) {
        m_NodeKeyWords.insert(make_pair(kKnownElements[i].name, kKnownElements[i].type));
    }

    // No graph exists until parseGrammar() is called.
    m_pGraph = 0;

    // Auto-increment counters all start from zero.
    m_RuleAutoIndex = 0;
    m_TagAutoIndex = 0;
    m_LabelAutoIndex = 0;
    m_ExpandedRulesAutoIndex = 0;

    // Placeholder file name; parseGrammar() overwrites it.
    m_XMLFileName = "dummy.xml";
}
// Releases the per-rule subgraphs and then the word graph itself.
GRXMLDoc::~GRXMLDoc()
{
    deleteRules();
    // delete on a null pointer is a no-op, so no guard is required.
    delete m_pGraph;
}
/**
 * Parses a whole grammar document into the word graph.
 *
 * @param node          the top-level "document" node of the parsed XML
 * @param xMLFileName   name of the grammar file; stored in m_XMLFileName
 * @return always true; fatal problems terminate the process via FATAL_ERROR
 *         inside the node handlers.
 */
bool GRXMLDoc::parseGrammar( XMLNode &node, std::string & xMLFileName )
{
    m_XMLFileName = xMLFileName;

    // Set up the internally defined rules, etc.
    initializeLists();

    // Create a fresh container for the word graph, dropping any previous one.
    if (m_pGraph) {
        delete m_pGraph;
    }
    m_pGraph = new Graph("XML grammar");

    // The "current subgraph" pointer is threaded by reference through the
    // recursive parser and only becomes valid once a <rule> is entered.
    // Start it as NULL so that it never carries an indeterminate value.
    SubGraph *p_SubGraph = NULL;
    parseNode( node, p_SubGraph, 1 );

    // Expand the root rule, if the grammar declares one that exists.
    if (findSubGraph( m_RootRule, p_SubGraph )) {
        m_pGraph->ExpandRules (p_SubGraph);
        p_SubGraph->RemoveInternalConnections ();
    }
    return true;
}
/**
 * Recursively walks an XML node: beginNode(), then every child, then endNode().
 *
 * beginNode() may replace p_SubGraph (each rule gets its own subgraph). The
 * subgraph that is current right after beginNode() is remembered, used for
 * all children, and reinstated before endNode() runs, whatever the children
 * did to the reference.
 *
 * @return always true
 */
bool GRXMLDoc::parseNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
{
#if 0 && GRXML_DEBUG
    if(node.Type() == TiXmlNode::ELEMENT)
        node.ToElement()->Print( stdout, level);
    else if(node.Type() == TiXmlNode::DOCUMENT)
        node.ToDocument()->Print( stdout, level);
    else if(node.Type() == TiXmlNode::TEXT)
        node.ToText()->Print( stdout, level);
    else if(node.Type() == TiXmlNode::DECLARATION)
        node.ToDeclaration()->Print( stdout, level);
    else {
        const char* text = node.Value();
        if(!text) text = "__NULL__";
        printf("processing node type %d text %s\n", node.Type(), text);
    }
#endif
    // Open the current node; this may switch to a new subgraph.
    beginNode( node, p_SubGraph, level );
    SubGraph * const subGraphAfterBegin = p_SubGraph;

    // Descend into the children, one nesting level deeper.
    TiXmlNode* child = node.FirstChild();
    while (child) {
        parseNode( *child, p_SubGraph, level+1 );
        child = child->NextSibling();
    }

    // Restore this node's subgraph and close the node.
    p_SubGraph = subGraphAfterBegin;
    endNode( node, p_SubGraph, level );
    return true;
} // parseNode
/**
 * Performs the "opening" work for one XML node before its children are parsed.
 *
 * - Text nodes are handed to processCDATA().
 * - Comments, and declarations with an empty value, are ignored.
 * - Elements (and other childless nodes) are dispatched on their name through
 *   m_NodeKeyWords; an unknown name is fatal.
 * - Any other node (e.g. the document node) is ignored.
 *
 * Some handlers replace p_SubGraph with a new current subgraph.
 *
 * @return always true; an unknown tag terminates the process via FATAL_ERROR.
 */
bool GRXMLDoc::beginNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
{
    std::string name = node.Value();
    DEBUG_PRINT("Element = " + name);

    if ( node.Type() == TiXmlNode::TEXT ) {
        DEBUG_PRINT (std::string("CDATA ") + name);
        DEBUG_PRINT (std::string("CDATA ") + node.Parent()->Value());
        processCDATA( node, p_SubGraph );
        return true;
    }

    // Comments are skipped before the keyword lookup so that a comment whose
    // text happens to equal an element name is not treated as that element.
    if ( node.Type() == TiXmlNode::COMMENT ) {
        return true;
    }

    // Only elements and childless nodes are dispatched by name.
    if ( node.Type() != TiXmlNode::ELEMENT && !node.NoChildren() ) {
        return true;
    }

    KeywordValues nodeType = NodeTypeBadValue;
    KEYWDPAIR::iterator pos = m_NodeKeyWords.find( name );
    if ( pos != m_NodeKeyWords.end() ) {
        nodeType = pos->second;
        // Streamed as two values: adding the enum to the string literal
        // would be pointer arithmetic, not concatenation.
        DEBUG_PRINT("nodeType=");
        DEBUG_PRINT(static_cast<int>(nodeType));
    } else if ( node.Type() == TiXmlNode::DECLARATION && name.length()==0 ) {
        return true;
    } else {
        FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT);
    }

    switch ( nodeType )
    {
    case NodeTypeGrammar:
        beginParseGrammarNode( node );
        break;
    case NodeTypeRule:
        // NB This fn creates a new subgraph.
        beginParseRuleNode( node, p_SubGraph );
        break;
    case NodeTypeRuleReference:
        // NB This fn creates a new subgraph.
        beginRuleRef( node, p_SubGraph );
        break;
    case NodeTypeOneOf:
        beginOneOf( node, p_SubGraph );
        break;
    case NodeTypeItem:
        beginItem( node, p_SubGraph );
        break;
    case NodeTypeTag:
        beginTag( node, p_SubGraph );
        break;
    case NodeTypeCount:
        beginCount( node, p_SubGraph );
        break;
    case NodeTypeMeta:
        beginParseMetaNode( node );
        break;
    case NodeTypeBadValue:
    default:
        DEBUG_PRINT( "UNKNOWN node name: " + name );
        break;
    }; // switch
    return true;
} // beginNode()
/**
 * Performs the "closing" work for one XML node after its children were parsed.
 *
 * Text and comment nodes have nothing to close and are skipped up front.
 * This matters because their value is free text: a grammar word such as
 * "item" or "count" would otherwise match an entry in m_NodeKeyWords and
 * trigger a spurious endItem()/endCount()/... call on the current subgraph.
 *
 * Elements (and other childless nodes) are dispatched on their name; an
 * unknown name is fatal.
 *
 * @return always true; an unknown tag terminates the process via FATAL_ERROR.
 */
bool GRXMLDoc::endNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
{
    if ( node.Type() == TiXmlNode::TEXT || node.Type() == TiXmlNode::COMMENT ) {
        return true;
    }

    // Only elements and childless nodes are dispatched by name.
    if ( node.Type() != TiXmlNode::ELEMENT && !node.NoChildren() ) {
        return true;
    }

    std::string name = node.Value();
    KeywordValues nodeType = NodeTypeBadValue;
    KEYWDPAIR::iterator pos = m_NodeKeyWords.find( name );
    if ( pos != m_NodeKeyWords.end() ) {
        nodeType = pos->second;
    } else if ( node.Type() == TiXmlNode::DECLARATION && name.length()==0 ) {
        return true;
    } else {
        FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT );
    }

    switch ( nodeType )
    {
    case NodeTypeGrammar:
        endParseGrammarNode( node );
        break;
    case NodeTypeRule:
        endParseRuleNode( node, p_SubGraph );
        break;
    case NodeTypeRuleReference:
        endRuleRef( node, p_SubGraph );
        break;
    case NodeTypeOneOf:
        endOneOf( node, p_SubGraph );
        break;
    case NodeTypeItem:
        endItem( node, p_SubGraph );
        break;
    case NodeTypeTag:
        endTag( node, p_SubGraph );
        break;
    case NodeTypeCount:
        endCount( node, p_SubGraph );
        break;
    case NodeTypeMeta:
        endParseMetaNode( node );
        break;
    case NodeTypeBadValue:
    default:
        DEBUG_PRINT( "UNKNOWN node name: ");
        DEBUG_PRINT( name.c_str() );
        break;
    }; // switch
    return true;
} // endNode()
// Handles the opening <grammar> element: copies its attributes into the
// corresponding m_XML* members and records the root rule name in m_RootRule.
// Always returns true.
bool GRXMLDoc::beginParseGrammarNode(XMLNode &node)
{
// Scratch pointer written by GETATTR.
const char* attr;
// GETATTR(name) evaluates to the value of attribute `name` on `node`, or ""
// when the attribute is absent. It requires `node` and a `const char* attr`
// variable in the enclosing scope.
// NOTE: the macro is never #undef'd; later functions in this file use it too,
// so this definition must stay ahead of them.
#define GETATTR(nAmE) ((attr=node.ToElement()->Attribute(nAmE))!=NULL) ? attr:""
m_XMLMode = GETATTR("mode");
m_XMLLanguage = GETATTR("xml:lang");
m_RootRule = GETATTR("root"); // The root rule name
DEBUG_PRINT("Root rule = " + m_RootRule);
m_XMLTagFormat = GETATTR("tag-format");
m_XMLVersion = GETATTR("version");
m_XMLBase = GETATTR("xml:base");
return true;
}
/**
 * Handles a <meta name="..." content="..."/> element.
 *
 * - "word_penalty" is stored as given.
 * - "do_skip_interword_silence" is lower-cased and stored only when it is
 *   "true" or "false"; any other value produces a warning and is dropped.
 * - Every other name (including "userdict_name") is reported as unsupported
 *   and ignored.
 *
 * @return always true
 */
bool GRXMLDoc::beginParseMetaNode(XMLNode &node)
{
    const char* attr;   // scratch pointer required by GETATTR
    std::string meta_name = GETATTR("name");
    std::string meta_value = GETATTR("content");

    if (meta_name == "word_penalty") {
        m_MetaKeyValPairs.insert(meta_name, meta_value);
    } else if (meta_name == "do_skip_interword_silence") {
        // tolower() has undefined behaviour for negative arguments other than
        // EOF, so each character goes through unsigned char first.
        for (std::string::size_type j = 0; j < meta_value.size(); ++j) {
            meta_value[j] = static_cast<char>(
                tolower(static_cast<unsigned char>(meta_value[j])));
        }
        if (meta_value != "true" && meta_value != "false")
            printf ("\nWarning: %s must be set to 'true' or 'false'; defaulting to 'false'\n", meta_name.c_str());
        else
            m_MetaKeyValPairs.insert(meta_name, meta_value);
    } else {
        printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str());
    }
    return true;
}
// Called when the closing </grammar> is reached. Currently a no-op: the node
// is not inspected and the function always returns true.
bool GRXMLDoc::endParseGrammarNode(XMLNode &node)
{
// End parse operations
return true;
}
/**
 * Handles an opening <rule id="..." scope="..."> element.
 *
 * Pushes the rule name on m_RuleListStack, makes the rule's subgraph the
 * current one (re-using the placeholder created by an earlier forward
 * reference, or creating a new one), records a non-empty scope attribute in
 * m_RuleScope, and opens an implicit item so that rules containing bare text
 * are handled.
 *
 * A "tag" attribute is rejected with FATAL_ERROR.
 *
 * @return always true
 */
bool GRXMLDoc::beginParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph)
{
    const char* attr;   // scratch pointer required by GETATTR
    DEBUG_PRINT ( "---- Rule\n" );

    std::string ruleName = GETATTR("id" );
    const std::string tagAttr = GETATTR("tag" );
    if ( !tagAttr.empty() ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
    }
    CHECK_NOT_EMPTY( ruleName, "id" );

    // Remember which rule we are inside of while its children are parsed.
    m_RuleListStack.push( ruleName );

    int ruleIndex;
    if ( !findRuleIndex( ruleName, ruleIndex ) ) {
        // First time this rule is seen: create its subgraph.
        SubGraph *createdGraph;
        addRuleToList( ruleName, createdGraph );
        p_SubGraph = createdGraph;
    }
    else {
        // The rule was forward referenced; swap in its placeholder subgraph.
        SubGraph *placeholderGraph;
        if ( !findSubGraph( ruleName, placeholderGraph ) ) {
            FATAL_ERROR("ERROR! Subgraph without rule name entry found!", -1);
        }
        p_SubGraph = placeholderGraph;
    }

    // Note the scope of the rule (public, etc.) - used in the map file.
    findRuleIndex( ruleName, ruleIndex );
    const std::string scopeAttr = GETATTR("scope" );
    if ( scopeAttr.length() > 0 ) {
        m_RuleScope.insert(ruleIndex, scopeAttr);
    }

    // Rules may contain CDATA without an <item>; infer one for every rule.
    m_pGraph->BeginItem( p_SubGraph );
    PRINT_EXPRESSION( ruleName + " = { " );
    return true;
} // beginParseRuleNode()
/**
 * Handles a closing </rule>: leaves the rule's naming context, registers the
 * finished subgraph with the graph and closes the implicit item that
 * beginParseRuleNode() opened.
 *
 * @return always true
 */
bool GRXMLDoc::endParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph )
{
    DEBUG_PRINT ( "---- /Rule\n" );

    // This rule is no longer the innermost one being parsed.
    m_RuleListStack.pop();

    // Hand the completed subgraph over to the graph.
    m_pGraph->addSubGraph ( p_SubGraph );

    // Close the <item> that is inferred for every rule.
    m_pGraph->EndItem( p_SubGraph );
    PRINT_EXPRESSION( " }\n" );
    return true;
}
/**
 * Handles the text content of a node.
 *
 * Text whose parent element is <tag> is ignored here (beginTag() reads it
 * and must not have its whitespace altered). All other text is split on
 * whitespace and every word is added to the label list and to the current
 * subgraph.
 *
 * A word for which IsSlot() is true is renamed to
 * "<grammar file basename>.<current rule>@<word>" before being added, and
 * the resulting label index is recorded in m_SlotList together with the
 * name of the enclosing rule.
 *
 * @return always true
 */
bool GRXMLDoc::processCDATA( XMLNode &node, SubGraph *&p_SubGraph )
{
    const std::string parentName( node.Parent()->Value() );
    if ( parentName == "tag" ) {
        // Tag text is handled by beginTag().
        return true;
    }

    const char* const whitespace = " \t\r\n\v\f";
    std::string cdata = node.Value();

    // Strip leading and trailing whitespace.
    cdata.erase(0, cdata.find_first_not_of(whitespace) );
    cdata.erase(cdata.find_last_not_of(whitespace) + 1);
#if GRXML_DEBUG
    std::cout << "/--" << cdata << "--/\n";
#endif

    // Walk the whitespace-separated words.
    std::string::size_type begIdx = cdata.find_first_not_of(whitespace);
    while (begIdx != std::string::npos) {
        std::string::size_type endIdx = cdata.find_first_of (whitespace, begIdx);
        if (endIdx == std::string::npos) {
            // The last word runs to the end of the text.
            endIdx = cdata.length();
        }
        std::string word( cdata, begIdx, endIdx - begIdx );
        if ( !word.empty() ) {
#if GRXML_DEBUG
            std::cout << " -->" << word << "<--\n";
#endif
            int index = -1;
            if ( IsSlot( word ) ) {
                // Qualify the slot with the grammar file's base name and the
                // rule it appears in.
                const std::string ruleName = m_RuleListStack.top();
                const std::string::size_type slashPos = m_XMLFileName.rfind('/');
                const std::string xmlBasename =
                    (slashPos == std::string::npos) ? m_XMLFileName
                                                    : m_XMLFileName.substr(slashPos + 1);
                word = xmlBasename + "." + ruleName + "@" + word;
                addLabelToList( word );
                findLabelIndex( word, index );
                // Record the slot only once its label index is known; doing
                // it earlier would store an indeterminate key.
                m_SlotList.insert(index, ruleName);
            } else {
                addLabelToList( word );
                findLabelIndex( word, index );
            }
            m_pGraph->AddLabel( p_SubGraph, index );
        }
        begIdx = cdata.find_first_not_of (whitespace, endIdx);
    }
    return true;
} // cdata
/**
 * Handles an opening <item>, honouring an optional repeat="..." attribute.
 *
 * Without a repeat attribute a plain item is opened. With one, the range is
 * parsed by get_range(): a positive maximum opens a counted item, anything
 * else opens an open-ended repeat starting at the parsed minimum.
 *
 * A "tag" attribute or an unparsable range is rejected with FATAL_ERROR.
 *
 * @return always true
 */
bool GRXMLDoc::beginItem( XMLNode &node, SubGraph *&p_SubGraph )
{
    const char* attr;   // scratch pointer required by GETATTR
    DEBUG_PRINT ("---- Item:\n");

    const std::string repeatSpec = GETATTR("repeat" );
    const std::string tagAttr = GETATTR("tag" );
    if ( !tagAttr.empty() ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
    }

    // No count/repeat given: an ordinary item.
    if ( repeatSpec.empty() ) {
        m_pGraph->BeginItem( p_SubGraph );
        return true;
    }

    int minCnt = 0;
    int maxCnt = 0;
    if ( get_range( repeatSpec, &minCnt, &maxCnt) ) {
        FATAL_ERROR(std::string("error: while parsing range ") + repeatSpec,1);
    }

    // RED FLAG: max should not be 0! A +ve number should have been given.
    if ( maxCnt <= 0 ) {
        // NB: BeginItemRepeat can only use min of 0 or 1!
        m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
    }
    else {
        m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
    }
    return true;
}
// Handles a closing </item> by calling Graph::EndItem on the current subgraph.
// The node itself is not inspected. Always returns true.
bool GRXMLDoc::endItem( XMLNode &node, SubGraph *&p_SubGraph )
{
DEBUG_PRINT ( "---- /Item\n" );
// What TODO if no tag for an item?
m_pGraph->EndItem( p_SubGraph );
return true;
}
/**
 * Handles an opening <ruleref uri="#name">.
 *
 * The uri must be present and start with '#' (only local references are
 * supported); anything else, or a "tag" attribute, is rejected with
 * FATAL_ERROR. A rule that has not been defined yet gets a placeholder
 * subgraph so it can be forward referenced. A reference to the rule's index
 * is then added to the current subgraph.
 *
 * @return always true
 */
bool GRXMLDoc::beginRuleRef( XMLNode &node, SubGraph *&p_SubGraph )
{
    DEBUG_PRINT ( "---- Ruleref\n" );
    const char* attr;   // scratch pointer required by GETATTR

    const std::string tagAttr = GETATTR("tag" );
    if ( !tagAttr.empty() ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
    }

    const std::string uri = GETATTR("uri" );
    if ( uri.length() == 0 ) {
        FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
    }

    // The reference must begin with the '#' marker.
    if ( uri[0] != '#' ) {
        FATAL_ERROR( "ERROR! bad ruleref name: '" + uri + "'" + ". Rule reference must start with a '#'. External references are not supported.", -1 );
    }

    // Name of the referenced rule, without the leading '#'.
    std::string referencedRule;
    getRuleRefName( node, referencedRule );

    int ruleIndex;
    if ( !findRuleIndex( referencedRule, ruleIndex ) ) {
        // Forward reference; create a placeholder subgraph.
        // RED FLAG: Remember to check fwd ref rule was filled in at end.
        SubGraph *placeholderGraph;
        addRuleToList( referencedRule, placeholderGraph );
        findRuleIndex( referencedRule, ruleIndex );
    }

    // From here on a forward-referenced rule is treated like a defined one.
    m_pGraph->BeginRule( p_SubGraph );
    m_pGraph->AddRuleRef( p_SubGraph, ruleIndex );
    m_pGraph->EndRule( p_SubGraph );
    return true;
}
// Handles a closing </ruleref>. Nothing is done here apart from the optional
// debug trace; neither argument is used. Always returns true.
bool GRXMLDoc::endRuleRef(XMLNode &grmNode, SubGraph *&p_SubGraph )
{
DEBUG_PRINT ( "---- /Ruleref\n" );
// Does nothing
// NB The tag is not under the ruleref element - it is in the current item element.
// We now add the tag of the AddRuleRef as we see the tag element. See EndTag().
return true;
}
// Handles an opening <one-of> by calling Graph::BeginOneOf on the current
// subgraph. The node itself is not inspected. Always returns true.
bool GRXMLDoc::beginOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
{
DEBUG_PRINT ( "----OneOf\n" );
m_pGraph->BeginOneOf (p_SubGraph);
return true;
}
// Handles a closing </one-of> by calling Graph::EndOneOf on the current
// subgraph. The node itself is not inspected. Always returns true.
bool GRXMLDoc::endOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
{
DEBUG_PRINT ( "----/OneOf\n" );
m_pGraph->EndOneOf (p_SubGraph);
return true;
}
/**
 * Handles an opening <tag>: stores the element's text as a semantic tag and
 * attaches its index to the current subgraph.
 *
 * The text is used verbatim (whitespace is not stripped). A <tag> with no
 * text, or with empty text, is ignored.
 *
 * @return always true
 */
bool GRXMLDoc::beginTag( XMLNode &node, SubGraph *&p_SubGraph )
{
    DEBUG_PRINT ("---- Tag\n");

    // GetText() returns a null pointer when the element has no leading text
    // child (e.g. <tag/>); constructing a std::string from it is undefined.
    const char* tagText = node.ToElement()->GetText();
    if ( tagText == NULL ) {
        return true;
    }
    std::string s( tagText );
#if GRXML_DEBUG
    std::cout << s; // debug
#endif
    // Store the semantic tag info.
    // NB Do not strip whitespace from tag cdata
    if ( !s.empty() )
    {
        int index;
        addTagToList( s );
        findTagIndex( s, index );
        m_pGraph->AddTag ( p_SubGraph, index );
    }
    return true;
}
// Handles a closing </tag>. Nothing is done here apart from the optional debug
// trace; neither argument is used. Always returns true.
bool GRXMLDoc::endTag( XMLNode &node, SubGraph *&p_SubGraph )
{
DEBUG_PRINT ("---- /Tag\n");
return true;
}
/**
 * Handles an opening <count number="...">.
 *
 * number may be the literal "optional" or a range understood by get_range():
 *   - "optional"                     -> BeginOptional
 *   - positive min and positive max  -> BeginCount(min, max)
 *   - positive min, no usable max    -> BeginItemRepeat(min, -1)
 *   - anything else                  -> BeginOptional
 *
 * A "tag" attribute or an unparsable range is rejected with FATAL_ERROR.
 *
 * @return false when the number attribute is missing or empty (nothing is
 *         opened in that case), true otherwise.
 */
bool GRXMLDoc::beginCount( XMLNode &node, SubGraph *&p_SubGraph )
{
    const char* attr;   // scratch pointer required by GETATTR
    // Count of reps applies to the text elements in this count node
    DEBUG_PRINT ("---- Count\n");

    std::string s = GETATTR("number");
    std::string s_tag = GETATTR("tag" );
    if ( s_tag.length()>0 ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
    }
    if ( s.empty() ) {
        return false;
    }

    // "optional" has to be recognised before the range parser runs:
    // get_range() rejects it as a non-number, which would abort the program
    // before this case could ever be reached.
    if ( s == "optional" ) {
        m_pGraph->BeginOptional( p_SubGraph );
        return true;
    }

    int minCnt=-1, maxCnt=-1;
    if ( get_range( s, &minCnt, &maxCnt) ) {
        FATAL_ERROR(std::string("error: while parsing range ") + s,1);
    }

    if ( minCnt>0 && maxCnt>0 ) {
        m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
    }
    else if ( minCnt>0 ) {
        m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
    }
    else {
        m_pGraph->BeginOptional ( p_SubGraph );
    }
    return true;
}
// Handles a closing </count> by calling Graph::EndCount on the current
// subgraph. The node itself is not inspected. Always returns true.
bool GRXMLDoc::endCount( XMLNode &node, SubGraph *&p_SubGraph )
{
DEBUG_PRINT ("---- /Count\n");
m_pGraph->EndCount( p_SubGraph );
return true;
}
// Called when a <meta> element is closed. Currently a no-op: the node is not
// inspected and the function always returns true.
bool GRXMLDoc::endParseMetaNode(XMLNode &node)
{
// End parse operations
return true;
}
/**
 * Debug helper: prints one node to stdout, indented by `level`.
 *
 * The line starts with a marker character ('+' element, '\' document,
 * '#' text, ' ' otherwise) followed by the node value; text nodes repeat
 * their value in brackets. Node types other than element, document and text
 * first emit an error line. For elements, every attribute is listed on its
 * own line below.
 */
void GRXMLDoc::printNode(XMLNode &node, int level)
{
    const std::string nodeName = node.Value();
    const int nodeType = node.Type();
    std::string textContent;

    // Indentation for the node line.
    for (int depth = 0; depth < level; ++depth) {
        std::cout << " ";
    }

    // Pick the marker for this kind of node.
    char marker = ' ';
    if (nodeType == TiXmlNode::ELEMENT) {
        marker = '+';
    } else if (nodeType == TiXmlNode::DOCUMENT) {
        marker = '\\';
    } else if (nodeType == TiXmlNode::TEXT) {
        marker = '#';
        textContent = node.Value();
    } else {
        // UNKNOWN, COMMENT, DECLARATION, TYPECOUNT and anything else.
        std::cout << "Error: not sure what to do here" << std::endl;
    }

    if (nodeType == TiXmlNode::TEXT) {
        std::cout << marker << nodeName.c_str() << "[" << textContent << "]" << std::endl;
    } else {
        std::cout << marker << nodeName.c_str() << std::endl;
    }

    if (nodeType != TiXmlNode::ELEMENT) {
        return;
    }

    // List the element's attributes, one per line, one step further in.
    TiXmlAttribute* attribute = node.ToElement()->FirstAttribute();
    while (attribute) {
        for (int depth = 0; depth < level; ++depth) {
            std::cout << " ";
        }
        std::cout << " ";
        std::cout << attribute->Name() << ": " << attribute->Value() << std::endl;
        attribute = attribute->Next();
    }
}
/** Function: addRuleToList
Extends list of SubGraphs with given subGraph
and extends list of rule names too.
TODO: Can we use one hash and use internal numeric index for rule IDs?
*/
bool GRXMLDoc::addRuleToList(std::string const & ruleName, SubGraph *&p_SubGraph)
{
int index;
if ( findRuleIndex ( ruleName, index ) ) {
FATAL_ERROR("ERROR! Rule name " + ruleName + " is already defined!", -1 );
}
addLabelToList( m_XMLFileName + "@" + ruleName);
findLabelIndex( m_XMLFileName + "@" + ruleName, index );
#if GRXML_DEBUG
std::cout << "Rule " << ruleName << std::endl;
#endif
// Create the new subgraph and update lists
m_RuleList.insert( ruleName, index );
p_SubGraph = new SubGraph( (char *) ruleName.c_str(), index );
bool success = m_SubgraphList.insert( ruleName, p_SubGraph );
if (!success) {
FATAL_ERROR("ERROR! subgraph for " + ruleName + " is already defined!", -1 );
}
#if ADD_BRACES
addLabelToList( "{" );
std::stringstream ss;
ss << "}(" << index << ")";
addLabelToList( ss.str());
#endif
return success;
}
bool GRXMLDoc::deleteRules()
{
// Delete all allocated subgraphs.
// The rule strings are part of the hashtables and get deleted by them.
int index;
SubGraph *p_SubGraph;
std::string ruleName;
while ( !m_RuleList.isEmpty() ) {
m_RuleList.getFirst( &ruleName, &index );
m_RuleList.remove( ruleName );
if (m_SubgraphList.getValue( ruleName, &p_SubGraph ) ) {
delete p_SubGraph;
}
else {
FATAL_ERROR("No subgraph for rule " + ruleName + "! Mismatched rules and subgraph hashtables!", -1);
}
}
m_SubgraphList.clear();
m_RuleList.clear();
m_LabelList.clear();
m_TagList.clear();
return true;
}
// Looks up the subgraph stored in m_SubgraphList under rule name s and
// returns it through p_SubGraph. Returns the result of HashMap::getValue
// (used by callers as "found").
bool GRXMLDoc::findSubGraph(std::string & s, SubGraph *&p_SubGraph)
{
return m_SubgraphList.getValue(s, &p_SubGraph);
}
// Retrieves into s the rule name associated with index i, via
// m_RuleList.getIndex. Rules are inserted as (name, index) in addRuleToList(),
// so this is presumably the reverse (index -> name) lookup.
// Returns the result of getIndex.
bool GRXMLDoc::findRule(int i, std::string &s )
{
return m_RuleList.getIndex( i, &s );
}
// Retrieves into s the tag text stored in m_TagList under index i.
// Returns the result of HashMap::getValue.
bool GRXMLDoc::findTag(int i, std::string &s )
{
return m_TagList.getValue( i, &s );
}
// Retrieves into s the label text stored in m_LabelList under index i.
// Returns the result of HashMap::getValue.
bool GRXMLDoc::findLabel(int i, std::string &s )
{
return m_LabelList.getValue( i, &s );
}
// Retrieves into s the name under which p_SubGraph is stored in
// m_SubgraphList, via HashMap::getIndex (presumably a value -> key lookup).
// Returns the result of getIndex.
bool GRXMLDoc::findSubGraphIndex( SubGraph *p_SubGraph, std::string &s )
{
return m_SubgraphList.getIndex( p_SubGraph, &s );
}
// Retrieves into i the index stored in m_RuleList for rule name s.
// Returns the result of HashMap::getValue (used by callers as "rule exists").
bool GRXMLDoc::findRuleIndex( std::string s, int &i )
{
return m_RuleList.getValue( s, &i );
}
// Retrieves into i the index under which tag text s is stored in m_TagList,
// via HashMap::getIndex (presumably a value -> key lookup).
// Returns the result of getIndex.
bool GRXMLDoc::findTagIndex( std::string s, int &i )
{
return m_TagList.getIndex( s, &i );
}
// Retrieves into i the index under which label text s is stored in
// m_LabelList, via HashMap::getIndex (presumably a value -> key lookup).
// Returns the result of getIndex.
bool GRXMLDoc::findLabelIndex( std::string s, int &i )
{
return m_LabelList.getIndex( s, &i );
}
// Retrieves into s the value stored in m_MetaKeyValPairs for meta name sn.
// Returns the result of HashMap::getValue.
bool GRXMLDoc::findMeta(const std::string & sn, std::string &s)
{
return m_MetaKeyValPairs.getValue( sn, &s );
}
/**
 * Stores meta value s under name sn, replacing any value already stored
 * under that name.
 *
 * @return the result of inserting the new pair
 */
bool GRXMLDoc::setMeta(const std::string & sn, const std::string &s)
{
    std::string previousValue;
    const bool alreadyPresent = findMeta(sn, previousValue);
    if (alreadyPresent) {
        // Drop the old entry first so the insert below takes effect.
        m_MetaKeyValPairs.remove(sn);
    }
    return m_MetaKeyValPairs.insert(sn, s);
}
/**
 * Adds tag text s to the tag list under the next automatic index, unless the
 * same text is already present.
 *
 * @return true when the tag was already known, otherwise the result of the
 *         insertion
 */
bool GRXMLDoc::addTagToList( std::string const& s )
{
    int existingIndex;
    if ( findTagIndex( s, existingIndex ) ) {
        // Values are kept unique; nothing to add.
        return true;
    }
    return m_TagList.insert( m_TagAutoIndex++, s );
}
/**
 * Adds label text s to the label list under the next automatic index.
 *
 * @return false when the label already exists (nothing is added), otherwise
 *         the result of the insertion
 */
bool GRXMLDoc::addLabelToList( std::string const& s )
{
    // TODO: Labels should be unique. Change key.
    int existingIndex;
    if ( m_LabelList.getIndex( s, &existingIndex ) ) {
        return false; // exists
    }
    return m_LabelList.insert( m_LabelAutoIndex++, s );
}
// Debug helper: calls print() on the subgraph, rule, tag and label lists,
// in that order.
void GRXMLDoc::printLists()
{
m_SubgraphList.print();
m_RuleList.print();
m_TagList.print();
m_LabelList.print();
}
void GRXMLDoc::printSubgraphs()
{
SubGraph *p_SubGraph;
std::string rule;
int index;
if ( m_RuleList.getFirst( &rule, &index) ) {
if ( findSubGraph( rule, p_SubGraph ) ) {
DEBUG_PRINT("============ Rule: " + rule + "============");
printSubgraph( *p_SubGraph );
while ( m_RuleList.getNext( &rule, &index) ) {
if ( findSubGraph( rule, p_SubGraph ) ) {
printSubgraph( *p_SubGraph );
}
}
}
}
}
// Debug helper: prints one subgraph by calling SubGraph::PrintWithLabels,
// passing this document to it.
void GRXMLDoc::printSubgraph( SubGraph &p_SubGraph )
{
p_SubGraph.PrintWithLabels( *this );
}
/**
 * Extracts the referenced rule name from a <ruleref> node: the "uri"
 * attribute with its leading '#' removed.
 *
 * A missing/empty uri, or one that does not start with '#', terminates the
 * process via FATAL_ERROR.
 *
 * @param node      the <ruleref> element
 * @param ruleName  receives the rule name
 * @return always true
 */
bool GRXMLDoc::getRuleRefName(XMLNode &node, std::string &ruleName)
{
    const char* attr;   // scratch pointer required by GETATTR
    const std::string uri = GETATTR("uri" );
    if ( uri.length() == 0 ) {
        FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
    }
    if ( uri[0] != '#' ) {
        FATAL_ERROR( "ERROR! bad ruleref name: '" + uri + "'", -1 );
    }
    // Everything after the '#'.
    ruleName = uri.substr( 1 );
    return true;
}
/**
 * Names the internal lists and registers the predefined labels.
 *
 * The order required for labels in the .map output file is:
 *   0 eps, then slots, then pau and pau2, then everything else.
 * The predefined labels are added now in case they are referenced; the
 * re-indexing for the .map and .P.txt output files happens after the grammar
 * has been parsed, once the list of slots is known.
 */
void GRXMLDoc::initializeLists()
{
    m_SubgraphList.setName("Subgraphs");
    m_RuleList.setName("Rules");
    m_TagList.setName("Tags");
    m_LabelList.setName("Labels");

    // Predefined labels, in registration order.
    static const char* const kPredefinedLabels[] = { "eps", "-pau-", "-pau2-" };
    const unsigned int numLabels = sizeof(kPredefinedLabels) / sizeof(kPredefinedLabels[0]);
    for (unsigned int i = 0; i < numLabels; ++i) {
        addLabelToList( kPredefinedLabels[i] );
    }
}
/**
 * Writes the label map file: one "<label> <index>" line per label, where the
 * labels have been re-indexed into this order:
 *   1. eps
 *   2. all slots
 *   3. all rules (any non-slot label containing '@')
 *   4. -pau- and -pau2-
 *   5. the remaining labels
 * A label whose original index has an entry in m_RuleScope is written as
 * "<scope>:<label>".
 *
 * @param fileName  path of the file to create
 */
void GRXMLDoc::writeMapFile( std::string & fileName )
{
    HashMap<int,std::string> orderedList;
    int orderedIndex = 0;
    int origIndex;
    int index;
    std::string label;
    std::string scope; // For rules
    bool more;

    // 1. eps
    orderedList.insert( orderedIndex++, "eps" );

    // 2. slots
    for ( more = m_LabelList.getFirst( &origIndex, &label ); more;
          more = m_LabelList.getNext( &origIndex, &label ) ) {
        if ( IsSlot( label ) ) {
            orderedList.insert( orderedIndex++, label );
        }
    }

    // 3. Now rules, or anything with @
    for ( more = m_LabelList.getFirst( &origIndex, &label ); more;
          more = m_LabelList.getNext( &origIndex, &label ) ) {
#if GRXML_DEBUG
        std::cout << label << " "<< label.find_first_of ("@") << std::endl;
#endif
        if ( !IsSlot(label) && label.find_first_of ("@") != std::string::npos ) {
#if GRXML_DEBUG
            std::cout << " Adding " << label << std::endl;
#endif
            orderedList.insert( orderedIndex++, label );
        }
    }

    // 4. pau
    orderedList.insert( orderedIndex++, "-pau-" );
    orderedList.insert( orderedIndex++, "-pau2-" );

    // 5. Remaining stuff. NB We depend upon the label not being added twice.
    for ( more = m_LabelList.getFirst( &origIndex, &label ); more;
          more = m_LabelList.getNext( &origIndex, &label ) ) {
        if ( !orderedList.getIndex( label, &index ) ) {
            orderedList.insert( orderedIndex++, label );
        }
    }

    // Emit the re-ordered labels.
    ofstream outfile;
    outfile.open ( fileName.c_str() );
    for ( more = orderedList.getFirst( &index, &label ); more;
          more = orderedList.getNext( &index, &label ) ) {
        // Look up scope using original index
        m_LabelList.getIndex( label, &origIndex );
        if ( m_RuleScope.getValue(origIndex, &scope) ) {
            label = scope + ":" + label;
        }
        outfile << label << " " << index << std::endl;
    }
    outfile.close();
}
/**
 * Writes the script file: one "<index> <tag text>" line for every entry of
 * the tag list, in the list's iteration order.
 *
 * @param fileName  path of the file to create
 */
void GRXMLDoc::writeScriptFile( std::string & fileName )
{
    ofstream outfile;
    outfile.open ( fileName.c_str() );

    int tagIndex;
    std::string tagText;
    for ( bool more = m_TagList.getFirst( &tagIndex, &tagText ); more;
          more = m_TagList.getNext( &tagIndex, &tagText ) ) {
        outfile << tagIndex << " " << tagText << std::endl;
    }
    outfile.close();
}
/**
 * Writes the parameters file. The file is always created; it contains the
 * line "word_penalty<TAB>=<TAB><value>" only when that meta value was set.
 *
 * @param fileName  path of the file to create
 */
void GRXMLDoc::writeParamsFile( std::string & fileName )
{
    ofstream outfile;
    outfile.open(fileName.c_str());

    const std::string metaname = "word_penalty";
    std::string penalty;
    if ( findMeta(metaname, penalty) ) {
        outfile << metaname.c_str() << "\t=\t" << penalty.c_str() << std::endl;
    }
    outfile.close();
}
// Writes the semantic graph file "<prefix>.P.txt" and the output label map
// "<prefix>.omap".
// A temporary "Main" subgraph containing a single reference to the root rule
// is built, expanded and reduced through the sequence of SubGraph operations
// below, written out, and deleted again.
// Terminates via FATAL_ERROR when the root rule has no subgraph.
// NOTE(review): bDoWriteRecogGraphs is not used in this function.
void GRXMLDoc::writeGraphFiles( std::string& prefix, bool bDoWriteRecogGraphs)
{
SubGraph *p_SubGraph;
SubGraph *p_SemGraph;
std::string fileName;
// The root rule named by the <grammar root="..."> attribute must exist.
if ( !findSubGraph( m_RootRule, p_SubGraph ) ) {
FATAL_ERROR ("ERROR: writeGraphFiles - no root rule "+ m_RootRule + " defined. No file created", -1 );
}
// Create .P.txt
printf ("\nCreating semantic graph file\n");
// Wrapper graph holding one reference to the root rule (rule id -1).
p_SemGraph = new SubGraph( (char *) "Main", -1);
m_pGraph->BeginRule( p_SemGraph );
m_pGraph->AddRuleRef( p_SemGraph, p_SubGraph->getRuleId());
m_pGraph->EndRule( p_SemGraph );
// Expand rule references, then simplify the resulting graph. The order of
// these calls is kept exactly as written.
m_pGraph->ExpandRules (p_SemGraph);
p_SemGraph->RemoveInternalConnections ();
p_SemGraph->AddTerminalConnections ();
p_SemGraph->ReduceArcsByEquivalence();
p_SemGraph->RemoveUnreachedConnections (-1, -1);
p_SemGraph->DeterminizeArcs();
p_SemGraph->RemoveUnreachedConnections (-1, -1);
p_SemGraph->ReduceArcsByEquivalence();
p_SemGraph->RemoveUnreachedConnections (-1, -1);
fileName = prefix + ".P.txt";
p_SemGraph->WriteForwardGraphWithSemantic( fileName, *this );
delete p_SemGraph;
// The output labels are written after the graph has been written.
fileName = prefix + ".omap";
this->WriteOLabels(fileName);
}
/**
 * Rebuilds m_SortedLabelList: every label of m_LabelList, sorted with
 * std::sort's default string ordering and numbered 0, 1, 2, ... in that
 * order.
 */
void GRXMLDoc::sortLabels()
{
    int origIndex;
    std::string label;

    // Collect all label texts.
    std::vector <std::string> collected;
    for ( bool more = m_LabelList.getFirst( &origIndex, &label ); more;
          more = m_LabelList.getNext( &origIndex, &label ) ) {
        collected.push_back( label );
    }

    std::sort( collected.begin(), collected.end() );

    // Store them under their position in the sorted sequence.
    m_SortedLabelList.clear();
    int sortedPos = 0;
    std::vector<std::string>::const_iterator it = collected.begin();
    while ( it != collected.end() ) {
        label = *it;
        m_LabelList.getIndex( label, &origIndex );
        m_SortedLabelList.insert( sortedPos, label );
        ++sortedPos;
        ++it;
    }
}
// Retrieves into s the label stored under index i in the sorted label list.
// The sorted list is built on demand by sortLabels() when it is still empty.
// Returns the result of HashMap::getValue.
bool GRXMLDoc::findSortedLabel(int i, std::string &s )
{
if (m_SortedLabelList.isEmpty() ) {
sortLabels(); // Create the sorted label list.
}
return m_SortedLabelList.getValue( i, &s );
}
/**
 * Translates an index into m_LabelList into the index the same label has in
 * the sorted label list. The sorted list is built on demand.
 *
 * @param i            index of the label in m_LabelList
 * @param sortedIndex  receives the label's index in the sorted list
 * @return true when the label exists in both lists, false otherwise
 */
bool GRXMLDoc::findSortedLabelIndex( int i, int &sortedIndex )
{
    if ( m_SortedLabelList.isEmpty() ) {
        sortLabels(); // Create the sorted label list.
    }

    std::string labelText;
    if ( !m_LabelList.getValue( i, &labelText ) ) {
        return false;
    }
    if ( !m_SortedLabelList.getIndex( labelText, &sortedIndex ) ) {
        return false;
    }
    return true;
}
// Registers output label s in m_OutputPtxtLabels with a value of 0.
// WriteOLabels() later assigns the values that are actually written out.
void GRXMLDoc::addOLabelToOList( std::string &s)
{
m_OutputPtxtLabels.insert( s, 0);
}
// Writes the output-label map (.omap): one "<label> <number>" line per label.
// Numbering: "eps" is 0 and "{" is 1; then every other label in
// m_OutputPtxtLabels that is not a script label, in iteration order; then the
// script labels SCRIPT_LABEL_PREFIX<0..max>, where max is the largest number
// seen on a label of the form SCRIPT_LABEL_PREFIX followed only by digits.
// NOTE(review): on success this returns 0, which converts to false for the
// bool return type; the only caller in this file ignores the result.
// Confirm the intended convention before relying on the return value.
bool GRXMLDoc::WriteOLabels(const std::string& fileName)
{
// Inverse map: assigned number -> label text.
HashMap<int,std::string> invMap;
int count = 0;
int max_script_label = 0;
int scriptID = 0;
std::map<std::string, int>::iterator iter;
bool bFound;
int tmp;
// Force "eps" to number 0, replacing any existing entry.
std::string strIndex = "eps";
bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
if(bFound)
m_OutputPtxtLabels.remove(strIndex);
m_OutputPtxtLabels.insert(strIndex, count);
invMap.insert( count, strIndex);
count++;
// Force "{" to number 1, replacing any existing entry.
strIndex = "{";
bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
if(bFound)
m_OutputPtxtLabels.remove(strIndex);
m_OutputPtxtLabels.insert(strIndex, count);
invMap.insert( count, strIndex);
count++;
// First pass: number the ordinary labels and find the highest script label.
iter = m_OutputPtxtLabels.begin();
for( ; iter!=m_OutputPtxtLabels.end(); iter++) {
const char* label = iter->first.c_str();
// A script label is the prefix followed by digits only; it is numbered
// in the second pass below.
if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)
&& strspn(label+SCRIPT_LABEL_PREFIX_LEN,"0123456789")==strlen(label+SCRIPT_LABEL_PREFIX_LEN) ) {
scriptID = atoi(label+SCRIPT_LABEL_PREFIX_LEN);
if(max_script_label < scriptID)
max_script_label = scriptID;
}/* else if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)) {
invMap.insert(count, iter->first);
iter->second = count;
count++;
}*/
// Labels already in invMap ("eps" and "{") are skipped here.
else if(!invMap.getIndex((iter->first), &tmp)){
invMap.insert(count, iter->first);
iter->second = count;
count++;
}
}
cout << "found max_script_label " << max_script_label << endl;
// Second pass: number script labels 0..max_script_label, including ones that
// never appeared in m_OutputPtxtLabels.
for(int j=0; j<=max_script_label; j++) {
std::stringstream ss;
ss << SCRIPT_LABEL_PREFIX << j;
if(!invMap.getIndex( ss.str(), &tmp)) {
invMap.insert( count++, ss.str());
}
}
std::ofstream outfile(fileName.c_str());
std::string outscript;
if(!outfile) {
// FATAL_ERROR calls exit(), so the two statements after it are never reached.
FATAL_ERROR( "Error: opening the omap file for output", 1);
WARNING( "Error: opening the omap file for output");
return 1;
}
// Write the labels in numeric order; a missing number is an internal error.
for(int i=0; i<count; i++) {
outscript = "";
invMap.getValue(i,&outscript);
if(outscript.length() == 0) {
cout << "error: internal error while making .omap " << i << endl;
FATAL_ERROR("error",1);
}
outfile << outscript.c_str() << " " << i << std::endl;
}
outfile.close();
return 0;
}