/* blob: 3e793c905f695053b13f39fc9550d5c3378232cf [file] [log] [blame] */
/*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file picowa.c
*
* word analysis PU - lexicon lookup and POS prediction
*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
* All rights reserved.
*
* History:
* - 2009-04-20 -- initial version
*
*/
#include "picoos.h"
#include "picodbg.h"
#include "picodata.h"
#include "picowa.h"
#include "picoklex.h"
#include "picokdt.h"
#include "picoktab.h"
#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif
/* PU waStep states: value stored in wa_subobj_t.procState; selects which
 * branch of the waStep state machine runs next */
#define WA_STEPSTATE_COLLECT 0 /* fetch one item from the input char buffer into inBuf */
#define WA_STEPSTATE_PROCESS 1 /* turn the item in inBuf into an item in outBuf */
#define WA_STEPSTATE_FEED 2 /* write the item in outBuf to the output char buffer */
/* subobject : WordAnaUnit
 * shortcut : wa
 * context size : one item
 *
 * Private state of the word analysis processing unit. It is allocated in
 * picowa_newWordAnaUnit, (re)set in waInitialize and used by waStep.
 */
typedef struct wa_subobj {
picoos_uint8 procState; /* current WA_STEPSTATE_* value; decides the next processing step */
/* one item only */
picoos_uint8 inBuf[PICOWA_MAXITEMSIZE]; /* internal input buffer */
picoos_uint16 inBufSize; /* capacity of inBuf; set to PICOWA_MAXITEMSIZE in waInitialize */
picoos_uint16 inLen; /* length of item in inBuf, 0 for empty buf */
picoos_uint8 outBuf[PICOWA_MAXITEMSIZE]; /* internal output buffer */
picoos_uint16 outBufSize; /* capacity of outBuf; set to PICOWA_MAXITEMSIZE in waInitialize */
picoos_uint16 outLen; /* length of item in outBuf, 0 for empty buf */
/* lex knowledge base (main lexicon, mandatory) */
picoklex_Lex lex;
/* ulex knowledge bases: only the first numUlex entries of ulex[] are valid */
picoos_uint8 numUlex;
picoklex_Lex ulex[PICOKNOW_MAX_NUM_ULEX];
/* tab knowledge base (used to map several POS values to a POS group) */
picoktab_Pos tabpos;
/* dtposp knowledge base (decision tree used for POS prediction) */
picokdt_DtPosP dtposp;
} wa_subobj_t;
static pico_status_t waInitialize(register picodata_ProcessingUnit this) {
picoos_uint8 i;
picoklex_Lex ulex;
wa_subobj_t * wa;
picoknow_kb_id_t ulexKbIds[PICOKNOW_MAX_NUM_ULEX] = PICOKNOW_KBID_ULEX_ARRAY;
PICODBG_DEBUG(("calling"));
if (NULL == this || NULL == this->subObj) {
return (picodata_step_result_t) picoos_emRaiseException(this->common->em,
PICO_ERR_NULLPTR_ACCESS, NULL, NULL);
}
wa = (wa_subobj_t *) this->subObj;
wa->procState = WA_STEPSTATE_COLLECT;
wa->inBufSize = PICOWA_MAXITEMSIZE;
wa->inLen = 0;
wa->outBufSize = PICOWA_MAXITEMSIZE;
wa->outLen = 0;
/* kb lex */
wa->lex = picoklex_getLex(this->voice->kbArray[PICOKNOW_KBID_LEX_MAIN]);
if (wa->lex == NULL) {
return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
NULL, NULL);
}
PICODBG_DEBUG(("got lex"));
/* kb ulex[] */
wa->numUlex = 0;
for (i = 0; i<PICOKNOW_MAX_NUM_ULEX; i++) {
ulex = picoklex_getLex(this->voice->kbArray[ulexKbIds[i]]);
if (NULL != ulex) {
wa->ulex[wa->numUlex++] = ulex;
}
}
PICODBG_DEBUG(("got %i user lexica", wa->numUlex));
/* kb tabpos */
wa->tabpos =
picoktab_getPos(this->voice->kbArray[PICOKNOW_KBID_TAB_POS]);
if (wa->tabpos == NULL) {
return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
NULL, NULL);
}
PICODBG_DEBUG(("got tabpos"));
/* kb dtposp */
wa->dtposp = picokdt_getDtPosP(this->voice->kbArray[PICOKNOW_KBID_DT_POSP]);
if (wa->dtposp == NULL) {
return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
NULL, NULL);
}
PICODBG_DEBUG(("got dtposp"));
return PICO_OK;
}
/* forward declaration: waStep is defined further down in this file but is
 * already needed by picowa_newWordAnaUnit, which installs it as the step
 * callback of the unit */
static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
picoos_int16 mode,
picoos_uint16 *numBytesOutput);
/**
 * Terminate callback of the word analysis unit.
 *
 * Performs no work for this unit.
 *
 * @param this  processing unit (not used)
 * @return always PICO_OK
 */
static pico_status_t waTerminate(register picodata_ProcessingUnit this) {
    (void) this; /* intentionally unused */
    return PICO_OK;
}
/**
 * Release the wa_subobj_t attached to a word analysis unit.
 *
 * The sub-object is released through this->common->mm; the mm argument is
 * not used.
 *
 * @param this  processing unit owning the sub-object; NULL is tolerated
 * @param mm    memory manager (ignored)
 * @return always PICO_OK
 */
static pico_status_t waSubObjDeallocate(register picodata_ProcessingUnit this,
                                        picoos_MemoryManager mm) {
    (void) mm; /* avoid warning "var not used in this function" */

    if (NULL == this) {
        return PICO_OK;
    }
    picoos_deallocate(this->common->mm, (void *) &this->subObj);
    return PICO_OK;
}
/**
 * Create a word analysis processing unit.
 *
 * Builds a generic processing unit, installs the wa callbacks
 * (initialize/step/terminate/subDeallocate), allocates the wa_subobj_t
 * sub-object and runs waInitialize once.
 *
 * @param mm      memory manager used for the unit and its sub-object
 * @param common  common object; its exception manager receives
 *                PICO_EXC_OUT_OF_MEM if the sub-object cannot be allocated
 * @param cbIn    input char buffer, handed to picodata_newProcessingUnit
 * @param cbOut   output char buffer, handed to picodata_newProcessingUnit
 * @param voice   voice, handed to picodata_newProcessingUnit
 * @return the new unit, or NULL if the unit or its sub-object could not be
 *         allocated. The status returned by waInitialize is not inspected
 *         here.
 */
picodata_ProcessingUnit picowa_newWordAnaUnit(picoos_MemoryManager mm,
                                              picoos_Common common,
                                              picodata_CharBuffer cbIn,
                                              picodata_CharBuffer cbOut,
                                              picorsrc_Voice voice) {
    picodata_ProcessingUnit this =
        picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice);

    if (NULL == this) {
        return NULL;
    }

    /* install the wa specific callbacks */
    this->initialize = waInitialize;
    PICODBG_DEBUG(("set this->step to waStep"));
    this->step = waStep;
    this->terminate = waTerminate;
    this->subDeallocate = waSubObjDeallocate;

    this->subObj = picoos_allocate(mm, sizeof(wa_subobj_t));
    if (NULL != this->subObj) {
        waInitialize(this);
        return this;
    }

    /* no sub-object: give the unit back before reporting the failure */
    picoos_deallocate(mm, (void *)&this);
    picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
    return NULL;
}
/* ***********************************************************************/
/* WORDGRAPH proc functions */
/* ***********************************************************************/
static picoos_uint8 waClassifyPos(register picodata_ProcessingUnit this,
register wa_subobj_t *wa,
const picoos_uint8 *graph,
const picoos_uint16 graphlen) {
picokdt_classify_result_t dtres;
picoos_uint8 specchar;
picoos_uint16 i;
PICODBG_DEBUG(("graphlen %d", graphlen));
/* check existence of special char (e.g. hyphen) in graph:
for now, check existence of hard-coded ascii hyphen,
ie. preproc needs to match all UTF8 hyphens to the ascii
hyphen. */
/* @todo : consider specifying special char(s) in lingware. */
specchar = FALSE;
i = 0;
while ((i < graphlen) && (!specchar)) {
if (graph[i++] == '-') {
specchar = TRUE;
}
}
/* construct input vector, which is set in dtposp */
if (!picokdt_dtPosPconstructInVec(wa->dtposp, graph, graphlen, specchar)) {
/* error constructing invec */
PICODBG_WARN(("problem with invec"));
picoos_emRaiseWarning(this->common->em, PICO_WARN_INVECTOR, NULL, NULL);
return PICODATA_ITEMINFO1_ERR;
}
/* classify */
if (!picokdt_dtPosPclassify(wa->dtposp)) {
/* error doing classification */
PICODBG_WARN(("problem classifying"));
picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
NULL, NULL);
return PICODATA_ITEMINFO1_ERR;
}
/* decompose */
if (!picokdt_dtPosPdecomposeOutClass(wa->dtposp, &dtres)) {
/* error decomposing */
PICODBG_WARN(("problem decomposing"));
picoos_emRaiseWarning(this->common->em, PICO_WARN_OUTVECTOR,
NULL, NULL);
return PICODATA_ITEMINFO1_ERR;
}
if (dtres.set) {
PICODBG_DEBUG(("class %d", dtres.class));
return (picoos_uint8)dtres.class;
} else {
PICODBG_WARN(("result not set"));
picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
NULL, NULL);
return PICODATA_ITEMINFO1_ERR;
}
}
/**
 * Process one WORDGRAPH item held in wa->inBuf and write the resulting
 * item to wa->outBuf.
 *
 * Lookup order: user lexica (in index order), then the main lexicon.
 *   - no entry found:
 *       POS prediction          -> WORDGRAPH(POSes,NA)graph
 *   - entry found, incl. phone:
 *       N entries possible      -> WORDINDEX(POSes,NA)POS1|ind1...POSN|indN
 *       (N in {1,...,PICOKLEX_MAX_NRRES})
 *   - entry found, no phone:
 *       one entry               -> WORDGRAPH(POS,NA)graph
 *
 * On success wa->inLen is reset to 0 and wa->outLen holds the length of
 * the produced item.
 *
 * @param this     processing unit; its exception manager receives
 *                 warnings and exceptions
 * @param wa       wa sub-object (inout: inBuf/inLen consumed, outBuf/outLen
 *                 produced)
 * @param head     header of the item in wa->inBuf (inout: rewritten when a
 *                 WORDINDEX item is produced)
 * @param content  content bytes of the item, head->len bytes long
 * @return PICO_OK on success; otherwise the value returned by
 *         picoos_emRaiseException for the failing step (status of
 *         picodata_copy_item / picodata_put_itemparts, or
 *         PICO_EXC_BUF_OVERFLOW if info1 cannot be set)
 */
static pico_status_t waProcessWordgraph(register picodata_ProcessingUnit this,
                                        register wa_subobj_t *wa /*inout*/,
                                        picodata_itemhead_t *head /*inout*/,
                                        const picoos_uint8 *content) {
    pico_status_t status;
    picoklex_lexl_result_t lexres;
    picoos_uint8 posbuf[PICOKTAB_MAXNRPOS_IN_COMB];
    picoos_uint8 i;
    picoos_uint8 foundIndex;
    picoos_bool found;

    PICODBG_DEBUG(("type %c, len %d", head->type, head->len));

    /* user lexica take precedence over the main lexicon */
    found = FALSE;
    i = 0;
    while (!found && (i < wa->numUlex)) {
        found = picoklex_lexLookup(wa->ulex[i], content, head->len, &lexres);
        i++;
    }
    /* i was incremented after the hit, so a user lexicon hit yields a
       1-based index; 0 stands for "not found in any user lexicon" */
    if (found) {
        foundIndex = i;
    } else {
        foundIndex = 0;
    }

    if (!found && !picoklex_lexLookup(wa->lex, content, head->len, &lexres)) {
        /* no lex entry found, WORDGRAPH(POS,NA)graph */
        status = picodata_copy_item(wa->inBuf, wa->inLen,
                                    wa->outBuf, wa->outBufSize,
                                    &wa->outLen);
        if (PICO_OK != status) {
            /* do not report success without a valid item in outBuf */
            return picoos_emRaiseException(this->common->em, status,
                                           NULL, NULL);
        }
        wa->inLen = 0;
        /* predict and modify pos in info1 */
        if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
                           waClassifyPos(this, wa, content, head->len))) {
            return picoos_emRaiseException(this->common->em,
                                           PICO_EXC_BUF_OVERFLOW, NULL, NULL);
        }
        return PICO_OK;
    }

    /* at least one entry found */
    PICODBG_DEBUG(("at least one entry found in lexicon %i",foundIndex));

    if (lexres.phonfound) { /* incl. ind-phone and possibly multi-ent. */
        if (lexres.nrres > PICOKLEX_MAX_NRRES) {
            /* not possible with system lexicon, needs to be
               ensured for user lex too */
            picoos_emRaiseWarning(this->common->em, PICO_WARN_FALLBACK, NULL,
                    (picoos_char *)"using %d lexicon lookup results",
                    PICOKLEX_MAX_NRRES);
            lexres.nrres = PICOKLEX_MAX_NRRES;
        }
        head->type = PICODATA_ITEM_WORDINDEX;
        if (lexres.nrres == 1) {
            head->info1 = lexres.posind[0];
        } else {
            /* more than one result, POSgroup info needs to be
               determined for later POS disambiguation; the POS is read
               from the start of each PICOKLEX_POSIND_SIZE-sized entry */
            /* NOTE(review): assumes PICOKLEX_MAX_NRRES <=
               PICOKTAB_MAXNRPOS_IN_COMB so posbuf cannot overflow --
               TODO confirm */
            for (i = 0; i < lexres.nrres; i++) {
                posbuf[i] = lexres.posind[i * PICOKLEX_POSIND_SIZE];
            }
            head->info1 = picoktab_getPosGroup(wa->tabpos, posbuf,
                                               lexres.nrres);
        }
        head->info2 = foundIndex;
        head->len = lexres.posindlen;
        status = picodata_put_itemparts(head, lexres.posind,
                                        lexres.posindlen,
                                        wa->outBuf, wa->outBufSize,
                                        &wa->outLen);
        if (PICO_OK != status) {
            return picoos_emRaiseException(this->common->em, status,
                                           NULL, NULL);
        }
        wa->inLen = 0;
        return PICO_OK;
    }

    /* no phone, :G2P, one entry: WORDGRAPH(POS,NA)graph */
    status = picodata_copy_item(wa->inBuf, wa->inLen,
                                wa->outBuf, wa->outBufSize,
                                &wa->outLen);
    if (PICO_OK != status) {
        /* do not report success without a valid item in outBuf */
        return picoos_emRaiseException(this->common->em, status, NULL, NULL);
    }
    wa->inLen = 0;
    /* set lex pos in info1 */
    if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
                                          lexres.posind[0])) {
        return picoos_emRaiseException(this->common->em,
                                       PICO_EXC_BUF_OVERFLOW, NULL, NULL);
    }
    return PICO_OK;
}
/* ***********************************************************************/
/* waStep function */
/* ***********************************************************************/
/*
collect into internal buffer, process, and then feed to output buffer
init state: COLLECT ext ext
state transitions: in IN OUTout
COLLECT | getOneItem ->-1 +1 0 0 | (ATOMIC) -> PROCESS (got item)
COLLECT | getOneItem -> 0 0 0 0 | IDLE (got no item)
PROCESS | procOneItem -> 0 -1 +1 0 | (ATOMIC) -> FEED (proc'ed item)
PROCESS | procOneItem -> 0 -1 0 0 | BUSY -> COLLECT (item skipped)
FEED | putOneItem -> 0 0 -1 +1 | BUSY -> COLLECT (put item)
FEED | putOneItem -> 0 0 1 0 | OUT_FULL (put no item)
*/
/**
 * Step function of the word analysis unit: collect one item into the
 * internal input buffer, process it into the internal output buffer and
 * feed it to the output char buffer.
 *
 * COLLECT falls through to PROCESS and PROCESS to FEED within the same
 * call; every other transition returns to the caller.
 *
 * @param this            processing unit whose subObj is a wa_subobj_t
 * @param mode            not used
 * @param numBytesOutput  set to 0 on entry, then handed to
 *                        picodata_cbPutItem in the FEED state
 * @return PICODATA_PU_IDLE      no item available in the input char buffer
 *         PICODATA_PU_BUSY      an item was fed, skipped or discarded
 *         PICODATA_PU_OUT_FULL  the output char buffer could not take the
 *                               item; it stays in outBuf for the next call
 *         PICODATA_PU_ERROR     NULL argument, or an unrecoverable status
 *                               (an exception is raised where an exception
 *                               manager is reachable)
 */
static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
                                     picoos_int16 mode,
                                     picoos_uint16 * numBytesOutput) {
    register wa_subobj_t *wa;
    pico_status_t rv = PICO_OK;

    if (NULL == this || NULL == this->subObj || NULL == numBytesOutput) {
        return PICODATA_PU_ERROR;
    }
    wa = (wa_subobj_t *) this->subObj;
    mode = mode; /* avoid warning "var not used in this function"*/
    *numBytesOutput = 0;

    while (1) { /* exit via return */
        PICODBG_DEBUG(("doing state %i, inLen: %d, outLen: %d",
                       wa->procState, wa->inLen, wa->outLen));

        switch (wa->procState) {

            /* collect state: get item from charBuf and store in
             * internal inBuf
             */
            case WA_STEPSTATE_COLLECT:
                if (wa->inLen == 0) { /* is input buffer empty? */
                    picoos_uint16 blen;
                    /* try to get one item */
                    rv = picodata_cbGetItem(this->cbIn, wa->inBuf,
                                            wa->inBufSize, &blen);
                    PICODBG_DEBUG(("after getting item, status: %d", rv));
                    if (PICO_OK == rv) {
                        /* we now have one item */
                        wa->inLen = blen;
                        wa->procState = WA_STEPSTATE_PROCESS;
                        /* uncomment next line to split into two steps */
                        /* return PICODATA_PU_ATOMIC; */
                    } else if (PICO_EOF == rv) {
                        /* there was no item in the char buffer */
                        return PICODATA_PU_IDLE;
                    } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
                               || (PICO_EXC_BUF_OVERFLOW == rv)) {
                        PICODBG_ERROR(("problem getting item"));
                        picoos_emRaiseException(this->common->em, rv,
                                                NULL, NULL);
                        return PICODATA_PU_ERROR;
                    } else {
                        PICODBG_ERROR(("problem getting item, unhandled"));
                        picoos_emRaiseException(this->common->em, rv,
                                                NULL, NULL);
                        return PICODATA_PU_ERROR;
                    }
                } else { /* there already is an item in the input buffer */
                    PICODBG_WARN(("item already in input buffer"));
                    picoos_emRaiseWarning(this->common->em,
                                          PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
                    wa->procState = WA_STEPSTATE_PROCESS;
                    /* uncomment next to split into two steps */
                    /* return PICODATA_PU_ATOMIC; */
                }
                break;

            /* process state: process item in internal inBuf and put
             * result in internal outBuf
             */
            case WA_STEPSTATE_PROCESS:
                /* ensure there is an item in inBuf and it is valid */
                if ((wa->inLen > 0) && picodata_is_valid_item(wa->inBuf,
                                                              wa->inLen)) {
                    picodata_itemhead_t ihead;
                    picoos_uint8 *icontent;
                    pico_status_t rvP = PICO_OK;

                    rv = picodata_get_iteminfo(wa->inBuf, wa->inLen, &ihead,
                                               &icontent);
                    if (PICO_OK == rv) {
                        switch (ihead.type) {
                            case PICODATA_ITEM_WORDGRAPH:
                                if (0 < ihead.len) {
                                    rvP = waProcessWordgraph(this, wa, &ihead,
                                                             icontent);
                                } else {
                                    /* else ignore empty WORDGRAPH */
                                    wa->inLen = 0;
                                    wa->procState = WA_STEPSTATE_COLLECT;
                                    return PICODATA_PU_BUSY;
                                }
                                break;
                            case PICODATA_ITEM_OTHER:
                                /* skip item */
                                rvP = PICO_WARN_PU_DISCARD_BUF;
                                break;
                            default:
                                /* copy item unmodified */
                                rvP = picodata_copy_item(wa->inBuf,
                                                         wa->inLen, wa->outBuf,
                                                         wa->outBufSize,
                                                         &wa->outLen);
                                break;
                        }
                        if (PICO_OK == rvP) {
                            wa->inLen = 0;
                            wa->procState = WA_STEPSTATE_FEED;
                            /* uncomment next to split into two steps */
                            /* return PICODATA_PU_ATOMIC; */
                        } else if (PICO_WARN_PU_DISCARD_BUF == rvP) {
                            /* discard input buffer and get a new item */
                            PICODBG_INFO(("skipping OTHER item"));
                            /* picoos_emRaiseWarning(this->common->em,
                               PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
                             */
                            wa->inLen = 0;
                            wa->procState = WA_STEPSTATE_COLLECT;
                            return PICODATA_PU_BUSY;
                        } else {
                            /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf
                               PICO_ERR_OTHER <- no valid item in inbuf
                               or return from processWordgraph
                             */
                            PICODBG_ERROR(("problem processing item, status: %d",
                                           rvP));
                            picoos_emRaiseException(this->common->em, rvP,
                                                    NULL, NULL);
                            return PICODATA_PU_ERROR;
                        }
                    } else { /* could not get iteminfo */
                        /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf
                           PICO_ERR_OTHER <- no valid item in inbuf
                         */
                        PICODBG_ERROR(("problem getting item info, "
                                       "discard buffer content"));
                        wa->inLen = 0;
                        wa->procState = WA_STEPSTATE_COLLECT;
                        picoos_emRaiseException(this->common->em, rv,
                                                NULL, NULL);
                        return PICODATA_PU_ERROR;
                    }
                } else if (wa->inLen == 0) { /* no item in inBuf */
                    PICODBG_INFO(("no item in inBuf"));
                    /* wa->inLen = 0;*/
                    wa->procState = WA_STEPSTATE_COLLECT;
                    return PICODATA_PU_BUSY;
                } else { /* no valid item in inBuf */
                    /* bad state/item, discard buffer content */
                    PICODBG_WARN(("no valid item, discard buffer content"));
                    picoos_emRaiseWarning(this->common->em,
                                          PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
                    picoos_emRaiseWarning(this->common->em,
                                          PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
                    wa->inLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    return PICODATA_PU_BUSY;
                }
                break;

            /* feed state: copy item in internal outBuf to output charBuf */
            case WA_STEPSTATE_FEED:
                /* check that item fits in cb should not be needed */
                rv = picodata_cbPutItem(this->cbOut, wa->outBuf,
                                        wa->outLen, numBytesOutput);
                PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG],
                                   (picoos_uint8 *)"wana: ", wa->outBuf,
                                   wa->outLen);
                PICODBG_DEBUG(("put item, status: %d", rv));
                if (PICO_OK == rv) {
                    wa->outLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    return PICODATA_PU_BUSY;
                } else if (PICO_EXC_BUF_OVERFLOW == rv) {
                    /* keep the item; the caller retries once there is room */
                    PICODBG_INFO(("feeding, overflow, PICODATA_PU_OUT_FULL"));
                    return PICODATA_PU_OUT_FULL;
                } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
                           || (PICO_ERR_OTHER == rv)) {
                    PICODBG_WARN(("feeding problem, discarding item"));
                    wa->outLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    picoos_emRaiseWarning(this->common->em, rv, NULL,NULL);
                    return PICODATA_PU_BUSY;
                } else {
                    /* any other status: without this branch the loop would
                       re-enter FEED and retry the same put forever */
                    PICODBG_ERROR(("feeding problem, unhandled status: %d",
                                   rv));
                    wa->outLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    picoos_emRaiseException(this->common->em, rv,
                                            NULL, NULL);
                    return PICODATA_PU_ERROR;
                }
                break;

            default:
                /* unknown state: leave instead of spinning in while (1) */
                PICODBG_ERROR(("unknown step state %i", wa->procState));
                wa->procState = WA_STEPSTATE_COLLECT;
                picoos_emRaiseException(this->common->em, PICO_ERR_OTHER,
                                        NULL, NULL);
                return PICODATA_PU_ERROR;
        } /* switch */
    } /* while */

    /* should be never reached */
    PICODBG_ERROR(("reached end of function"));
    picoos_emRaiseException(this->common->em, PICO_ERR_OTHER, NULL, NULL);
    return PICODATA_PU_ERROR;
}
#ifdef __cplusplus
}
#endif
/* end */