| /* |
| * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /** |
| * @file picowa.c |
| * |
| * word analysis PU - lexicon lookup and POS prediction |
| * |
| * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland |
| * All rights reserved. |
| * |
| * History: |
| * - 2009-04-20 -- initial version |
| * |
| */ |
| |
| #include "picoos.h" |
| #include "picodbg.h" |
| #include "picodata.h" |
| #include "picowa.h" |
| #include "picoklex.h" |
| #include "picokdt.h" |
| #include "picoktab.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| #if 0 |
| } |
| #endif |
| |
| /* PU waStep states */ |
| #define WA_STEPSTATE_COLLECT 0 |
| #define WA_STEPSTATE_PROCESS 1 |
| #define WA_STEPSTATE_FEED 2 |
| |
| |
| /* subobject : WordAnaUnit |
| * shortcut : wa |
| * context size : one item |
| */ |
| typedef struct wa_subobj { |
| picoos_uint8 procState; /* for next processing step decision */ |
| |
| /* one item only */ |
| picoos_uint8 inBuf[PICOWA_MAXITEMSIZE]; /* internal input buffer */ |
| picoos_uint16 inBufSize; /* actually allocated size */ |
| picoos_uint16 inLen; /* length of item in inBuf, 0 for empty buf */ |
| |
| picoos_uint8 outBuf[PICOWA_MAXITEMSIZE]; /* internal output buffer */ |
| picoos_uint16 outBufSize; /* actually allocated size */ |
| picoos_uint16 outLen; /* length of item in outBuf, 0 for empty buf */ |
| |
| /* lex knowledge base */ |
| picoklex_Lex lex; |
| |
| /* ulex knowledge bases */ |
| picoos_uint8 numUlex; |
| picoklex_Lex ulex[PICOKNOW_MAX_NUM_ULEX]; |
| |
| /* tab knowledge base */ |
| picoktab_Pos tabpos; |
| |
| /* dtposp knowledge base */ |
| picokdt_DtPosP dtposp; |
| } wa_subobj_t; |
| |
| |
| static pico_status_t waInitialize(register picodata_ProcessingUnit this, picoos_int32 r_mode) { |
| picoos_uint8 i; |
| picoklex_Lex ulex; |
| wa_subobj_t * wa; |
| |
| picoknow_kb_id_t ulexKbIds[PICOKNOW_MAX_NUM_ULEX] = PICOKNOW_KBID_ULEX_ARRAY; |
| |
| PICODBG_DEBUG(("calling")); |
| |
| if (NULL == this || NULL == this->subObj) { |
| return (picodata_step_result_t) picoos_emRaiseException(this->common->em, |
| PICO_ERR_NULLPTR_ACCESS, NULL, NULL); |
| } |
| wa = (wa_subobj_t *) this->subObj; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| wa->inBufSize = PICOWA_MAXITEMSIZE; |
| wa->inLen = 0; |
| wa->outBufSize = PICOWA_MAXITEMSIZE; |
| wa->outLen = 0; |
| |
| if (r_mode == PICO_RESET_SOFT) { |
| /*following initializations needed only at startup or after a full reset*/ |
| return PICO_OK; |
| } |
| /* kb lex */ |
| wa->lex = picoklex_getLex(this->voice->kbArray[PICOKNOW_KBID_LEX_MAIN]); |
| if (wa->lex == NULL) { |
| return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| PICODBG_DEBUG(("got lex")); |
| |
| /* kb ulex[] */ |
| wa->numUlex = 0; |
| for (i = 0; i<PICOKNOW_MAX_NUM_ULEX; i++) { |
| ulex = picoklex_getLex(this->voice->kbArray[ulexKbIds[i]]); |
| if (NULL != ulex) { |
| wa->ulex[wa->numUlex++] = ulex; |
| } |
| } |
| PICODBG_DEBUG(("got %i user lexica", wa->numUlex)); |
| |
| /* kb tabpos */ |
| wa->tabpos = |
| picoktab_getPos(this->voice->kbArray[PICOKNOW_KBID_TAB_POS]); |
| if (wa->tabpos == NULL) { |
| return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| PICODBG_DEBUG(("got tabpos")); |
| |
| /* kb dtposp */ |
| wa->dtposp = picokdt_getDtPosP(this->voice->kbArray[PICOKNOW_KBID_DT_POSP]); |
| if (wa->dtposp == NULL) { |
| return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| PICODBG_DEBUG(("got dtposp")); |
| return PICO_OK; |
| } |
| |
| static picodata_step_result_t waStep(register picodata_ProcessingUnit this, |
| picoos_int16 mode, |
| picoos_uint16 *numBytesOutput); |
| |
| static pico_status_t waTerminate(register picodata_ProcessingUnit this) { |
| return PICO_OK; |
| } |
| |
| static pico_status_t waSubObjDeallocate(register picodata_ProcessingUnit this, |
| picoos_MemoryManager mm) { |
| if (NULL != this) { |
| picoos_deallocate(this->common->mm, (void *) &this->subObj); |
| } |
| mm = mm; /* avoid warning "var not used in this function"*/ |
| return PICO_OK; |
| } |
| |
| |
| picodata_ProcessingUnit picowa_newWordAnaUnit(picoos_MemoryManager mm, |
| picoos_Common common, |
| picodata_CharBuffer cbIn, |
| picodata_CharBuffer cbOut, |
| picorsrc_Voice voice) { |
| picodata_ProcessingUnit this; |
| |
| this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice); |
| if (this == NULL) { |
| return NULL; |
| } |
| |
| this->initialize = waInitialize; |
| PICODBG_DEBUG(("set this->step to waStep")); |
| this->step = waStep; |
| this->terminate = waTerminate; |
| this->subDeallocate = waSubObjDeallocate; |
| this->subObj = picoos_allocate(mm, sizeof(wa_subobj_t)); |
| if (this->subObj == NULL) { |
| picoos_deallocate(mm, (void *)&this); |
| picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); |
| return NULL; |
| } |
| |
| waInitialize(this, PICO_RESET_FULL); |
| return this; |
| } |
| |
| /* ***********************************************************************/ |
| /* WORDGRAPH proc functions */ |
| /* ***********************************************************************/ |
| |
| static picoos_uint8 waClassifyPos(register picodata_ProcessingUnit this, |
| register wa_subobj_t *wa, |
| const picoos_uint8 *graph, |
| const picoos_uint16 graphlen) { |
| picokdt_classify_result_t dtres; |
| picoos_uint8 specchar; |
| picoos_uint16 i; |
| |
| PICODBG_DEBUG(("graphlen %d", graphlen)); |
| |
| /* check existence of special char (e.g. hyphen) in graph: |
| for now, check existence of hard-coded ascii hyphen, |
| ie. preproc needs to match all UTF8 hyphens to the ascii |
| hyphen. */ |
| /* @todo : consider specifying special char(s) in lingware. */ |
| specchar = FALSE; |
| i = 0; |
| while ((i < graphlen) && (!specchar)) { |
| if (graph[i++] == '-') { |
| specchar = TRUE; |
| } |
| } |
| |
| /* construct input vector, which is set in dtposp */ |
| if (!picokdt_dtPosPconstructInVec(wa->dtposp, graph, graphlen, specchar)) { |
| /* error constructing invec */ |
| PICODBG_WARN(("problem with invec")); |
| picoos_emRaiseWarning(this->common->em, PICO_WARN_INVECTOR, NULL, NULL); |
| return PICODATA_ITEMINFO1_ERR; |
| } |
| |
| /* classify */ |
| if (!picokdt_dtPosPclassify(wa->dtposp)) { |
| /* error doing classification */ |
| PICODBG_WARN(("problem classifying")); |
| picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION, |
| NULL, NULL); |
| return PICODATA_ITEMINFO1_ERR; |
| } |
| |
| /* decompose */ |
| if (!picokdt_dtPosPdecomposeOutClass(wa->dtposp, &dtres)) { |
| /* error decomposing */ |
| PICODBG_WARN(("problem decomposing")); |
| picoos_emRaiseWarning(this->common->em, PICO_WARN_OUTVECTOR, |
| NULL, NULL); |
| return PICODATA_ITEMINFO1_ERR; |
| } |
| |
| if (dtres.set) { |
| PICODBG_DEBUG(("class %d", dtres.class)); |
| return (picoos_uint8)dtres.class; |
| } else { |
| PICODBG_WARN(("result not set")); |
| picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION, |
| NULL, NULL); |
| return PICODATA_ITEMINFO1_ERR; |
| } |
| } |
| |
| |
| static pico_status_t waProcessWordgraph(register picodata_ProcessingUnit this, |
| register wa_subobj_t *wa /*inout*/, |
| picodata_itemhead_t *head /*inout*/, |
| const picoos_uint8 *content) { |
| pico_status_t status; |
| picoklex_lexl_result_t lexres; |
| picoos_uint8 posbuf[PICOKTAB_MAXNRPOS_IN_COMB]; |
| picoos_uint8 i; |
| picoos_uint8 foundIndex; |
| picoos_bool found; |
| |
| |
| PICODBG_DEBUG(("type %c, len %d", head->type, head->len)); |
| |
| /* do lookup |
| if no entry found: |
| do POS prediction: -> WORDGRAPH(POSes,NA)graph |
| else: |
| if incl-phone: |
| N entries possible -> WORDINDEX(POSes,NA)POS1|ind1...POSN|indN |
| (N in {1,...,PICOKLEX_MAX_NRRES}, now up to 4) |
| else: |
| no phone, one entry -> WORDGRAPH(POS,NA)graph |
| */ |
| |
| found = FALSE; |
| i = 0; |
| while (!found && (i < wa->numUlex)) { |
| found = picoklex_lexLookup(wa->ulex[i], content, head->len, &lexres); |
| i++; |
| } |
| /* note that if found, i will be incremented nevertheless, so i >= 1 */ |
| if (found) { |
| foundIndex = i; |
| } else { |
| foundIndex = 0; |
| } |
| if (!found && !picoklex_lexLookup(wa->lex, content, head->len, &lexres)) { |
| /* no lex entry found, WORDGRAPH(POS,NA)graph */ |
| if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen, |
| wa->outBuf, wa->outBufSize, |
| &wa->outLen)) { |
| wa->inLen = 0; |
| /* predict and modify pos in info1 */ |
| if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen, |
| waClassifyPos(this, wa, content, head->len))) { |
| return picoos_emRaiseException(this->common->em, |
| PICO_EXC_BUF_OVERFLOW,NULL,NULL); |
| } |
| } |
| |
| } else { /* at least one entry found */ |
| PICODBG_DEBUG(("at least one entry found in lexicon %i",foundIndex)); |
| if (lexres.phonfound) { /* incl. ind-phone and possibly multi-ent. */ |
| if (lexres.nrres > PICOKLEX_MAX_NRRES) { |
| /* not possible with system lexicon, needs to be |
| ensured for user lex too */ |
| picoos_emRaiseWarning(this->common->em, PICO_WARN_FALLBACK,NULL, |
| (picoos_char *)"using %d lexicon lookup results", |
| PICOKLEX_MAX_NRRES); |
| lexres.nrres = PICOKLEX_MAX_NRRES; |
| } |
| head->type = PICODATA_ITEM_WORDINDEX; |
| if (lexres.nrres == 1) { |
| head->info1 = lexres.posind[0]; |
| } else { |
| /* more than one result, POSgroup info needs to be |
| determined for later POS disambiguation */ |
| for (i = 0; i < lexres.nrres; i++) { |
| posbuf[i] = lexres.posind[i * PICOKLEX_POSIND_SIZE]; |
| } |
| head->info1 = picoktab_getPosGroup(wa->tabpos, posbuf, |
| lexres.nrres); |
| } |
| head->info2 = foundIndex; |
| head->len = lexres.posindlen; |
| if ((status = picodata_put_itemparts(head, lexres.posind, |
| lexres.posindlen, |
| wa->outBuf, wa->outBufSize, |
| &wa->outLen)) == PICO_OK) { |
| wa->inLen = 0; |
| } else { |
| return picoos_emRaiseException(this->common->em, status, |
| NULL, NULL); |
| } |
| |
| } else { /* no phone, :G2P, one entry: WORDGRAPH(POS,NA)graph */ |
| if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen, |
| wa->outBuf, wa->outBufSize, |
| &wa->outLen)) { |
| wa->inLen = 0; |
| /* set lex pos in info1 */ |
| if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen, |
| lexres.posind[0])) { |
| return picoos_emRaiseException(this->common->em, |
| PICO_EXC_BUF_OVERFLOW, |
| NULL, NULL); |
| } |
| } |
| } |
| } |
| return PICO_OK; |
| } |
| |
| |
| /* ***********************************************************************/ |
| /* waStep function */ |
| /* ***********************************************************************/ |
| |
| /* |
| collect into internal buffer, process, and then feed to output buffer |
| |
| init state: COLLECT ext ext |
| state transitions: in IN OUTout |
| COLLECT | getOneItem ->-1 +1 0 0 | (ATOMIC) -> PROCESS (got item) |
| COLLECT | getOneItem -> 0 0 0 0 | IDLE (got no item) |
| |
| PROCESS | procOneItem -> 0 -1 +1 0 | (ATOMIC) -> FEED (proc'ed item) |
| PROCESS | procOneItem -> 0 -1 0 0 | BUSY -> COLLECT (item skipped) |
| |
| FEED | putOneItem -> 0 0 -1 +1 | BUSY -> COLLECT (put item) |
| FEED | putOneItem -> 0 0 1 0 | OUT_FULL (put no item) |
| */ |
| |
| static picodata_step_result_t waStep(register picodata_ProcessingUnit this, |
| picoos_int16 mode, |
| picoos_uint16 * numBytesOutput) { |
| register wa_subobj_t *wa; |
| pico_status_t rv = PICO_OK; |
| |
| if (NULL == this || NULL == this->subObj) { |
| return PICODATA_PU_ERROR; |
| } |
| wa = (wa_subobj_t *) this->subObj; |
| mode = mode; /* avoid warning "var not used in this function"*/ |
| *numBytesOutput = 0; |
| while (1) { /* exit via return */ |
| PICODBG_DEBUG(("doing state %i, inLen: %d, outLen: %d", |
| wa->procState, wa->inLen, wa->outLen)); |
| |
| switch (wa->procState) { |
| /* collect state: get item from charBuf and store in |
| * internal inBuf |
| */ |
| case WA_STEPSTATE_COLLECT: |
| if (wa->inLen == 0) { /* is input buffer empty? */ |
| picoos_uint16 blen; |
| /* try to get one item */ |
| rv = picodata_cbGetItem(this->cbIn, wa->inBuf, |
| wa->inBufSize, &blen); |
| PICODBG_DEBUG(("after getting item, status: %d", rv)); |
| if (PICO_OK == rv) { |
| /* we now have one item */ |
| wa->inLen = blen; |
| wa->procState = WA_STEPSTATE_PROCESS; |
| /* uncomment next line to split into two steps */ |
| /* return PICODATA_PU_ATOMIC; */ |
| } else if (PICO_EOF == rv) { |
| /* there was no item in the char buffer */ |
| return PICODATA_PU_IDLE; |
| } else if ((PICO_EXC_BUF_UNDERFLOW == rv) |
| || (PICO_EXC_BUF_OVERFLOW == rv)) { |
| PICODBG_ERROR(("problem getting item")); |
| picoos_emRaiseException(this->common->em, rv, |
| NULL, NULL); |
| return PICODATA_PU_ERROR; |
| } else { |
| PICODBG_ERROR(("problem getting item, unhandled")); |
| picoos_emRaiseException(this->common->em, rv, |
| NULL, NULL); |
| return PICODATA_PU_ERROR; |
| } |
| } else { /* there already is an item in the input buffer */ |
| PICODBG_WARN(("item already in input buffer")); |
| picoos_emRaiseWarning(this->common->em, |
| PICO_WARN_PU_IRREG_ITEM, NULL, NULL); |
| wa->procState = WA_STEPSTATE_PROCESS; |
| /* uncomment next to split into two steps */ |
| /* return PICODATA_PU_ATOMIC; */ |
| } |
| break; |
| |
| |
| /* process state: process item in internal inBuf and put |
| * result in internal outBuf |
| */ |
| case WA_STEPSTATE_PROCESS: |
| |
| /* ensure there is an item in inBuf and it is valid */ |
| if ((wa->inLen > 0) && picodata_is_valid_item(wa->inBuf, |
| wa->inLen)) { |
| picodata_itemhead_t ihead; |
| picoos_uint8 *icontent; |
| pico_status_t rvP = PICO_OK; |
| |
| rv = picodata_get_iteminfo(wa->inBuf, wa->inLen, &ihead, |
| &icontent); |
| if (PICO_OK == rv) { |
| |
| switch (ihead.type) { |
| case PICODATA_ITEM_WORDGRAPH: |
| |
| if (0 < ihead.len) { |
| rvP = waProcessWordgraph(this, wa, &ihead, |
| icontent); |
| } else { |
| /* else ignore empty WORDGRAPH */ |
| wa->inLen = 0; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| return PICODATA_PU_BUSY; |
| } |
| break; |
| case PICODATA_ITEM_OTHER: |
| /* skip item */ |
| rvP = PICO_WARN_PU_DISCARD_BUF; |
| break; |
| default: |
| /* copy item unmodified */ |
| rvP = picodata_copy_item(wa->inBuf, |
| wa->inLen, wa->outBuf, |
| wa->outBufSize, &wa->outLen); |
| break; |
| } |
| |
| if (PICO_OK == rvP) { |
| wa->inLen = 0; |
| wa->procState = WA_STEPSTATE_FEED; |
| /* uncomment next to split into two steps */ |
| /* return PICODATA_PU_ATOMIC; */ |
| } else if (PICO_WARN_PU_DISCARD_BUF == rvP) { |
| /* discard input buffer and get a new item */ |
| PICODBG_INFO(("skipping OTHER item")); |
| /* picoos_emRaiseWarning(this->common->em, |
| PICO_WARN_PU_DISCARD_BUF, NULL, NULL); |
| */ |
| wa->inLen = 0; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| return PICODATA_PU_BUSY; |
| } else { |
| /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf |
| PICO_ERR_OTHER <- no valid item in inbuf |
| or return from processWordgraph |
| */ |
| PICODBG_ERROR(("problem processing item", rvP)); |
| picoos_emRaiseException(this->common->em, rvP, |
| NULL, NULL); |
| return PICODATA_PU_ERROR; |
| } |
| |
| } else { /* could not get iteminfo */ |
| /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf |
| PICO_ERR_OTHER <- no valid item in inbuf |
| */ |
| PICODBG_ERROR(("problem getting item info, " |
| "discard buffer content")); |
| wa->inLen = 0; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| picoos_emRaiseException(this->common->em, rv, |
| NULL, NULL); |
| return PICODATA_PU_ERROR; |
| } |
| |
| } else if (wa->inLen == 0) { /* no item in inBuf */ |
| PICODBG_INFO(("no item in inBuf")); |
| /* wa->inLen = 0;*/ |
| wa->procState = WA_STEPSTATE_COLLECT; |
| return PICODATA_PU_BUSY; |
| |
| } else { /* no valid item in inBuf */ |
| /* bad state/item, discard buffer content */ |
| PICODBG_WARN(("no valid item, discard buffer content")); |
| picoos_emRaiseWarning(this->common->em, |
| PICO_WARN_PU_IRREG_ITEM, NULL, NULL); |
| picoos_emRaiseWarning(this->common->em, |
| PICO_WARN_PU_DISCARD_BUF, NULL, NULL); |
| wa->inLen = 0; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| return PICODATA_PU_BUSY; |
| } |
| break; |
| |
| |
| /* feed state: copy item in internal outBuf to output charBuf */ |
| case WA_STEPSTATE_FEED: |
| |
| /* check that item fits in cb should not be needed */ |
| rv = picodata_cbPutItem(this->cbOut, wa->outBuf, |
| wa->outLen, numBytesOutput); |
| |
| PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG], |
| (picoos_uint8 *)"wana: ", wa->outBuf, |
| wa->outLen); |
| |
| PICODBG_DEBUG(("put item, status: %d", rv)); |
| if (PICO_OK == rv) { |
| wa->outLen = 0; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| return PICODATA_PU_BUSY; |
| } else if (PICO_EXC_BUF_OVERFLOW == rv) { |
| PICODBG_INFO(("feeding, overflow, PICODATA_PU_OUT_FULL")); |
| return PICODATA_PU_OUT_FULL; |
| } else if ((PICO_EXC_BUF_UNDERFLOW == rv) |
| || (PICO_ERR_OTHER == rv)) { |
| PICODBG_WARN(("feeding problem, discarding item")); |
| wa->outLen = 0; |
| wa->procState = WA_STEPSTATE_COLLECT; |
| picoos_emRaiseWarning(this->common->em, rv, NULL,NULL); |
| return PICODATA_PU_BUSY; |
| } |
| break; |
| |
| default: |
| break; |
| |
| } /* switch */ |
| |
| } /* while */ |
| |
| /* should be never reached */ |
| PICODBG_ERROR(("reached end of function")); |
| picoos_emRaiseException(this->common->em, PICO_ERR_OTHER, NULL, NULL); |
| return PICODATA_PU_ERROR; |
| } |
| |
| #ifdef __cplusplus |
| } |
| #endif |
| |
| |
| /* end */ |