blob: 560709c83679651b92f81db10e47c51520314938 [file] [log] [blame]
/*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file picokfst.c
*
* FST knowledge loading and access
*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
* All rights reserved.
*
* History:
* - 2009-04-20 -- initial version
*
*/
#include "picoos.h"
#include "picodbg.h"
#include "picoknow.h"
#include "picokfst.h"
#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif
#define FileHdrSize 4 /* size in bytes of the FST file header; kfstInitialize starts decoding right after it and adds it to every table offset it reads */
/* ************************************************************/
/* function to create specialized kb, */
/* to be used by picorsrc only */
/* ************************************************************/
/** object : FSTKnowledgeBase
* shortcut : kfst
* derived from : picoknow_KnowledgeBase
*
* Holds the header values decoded once by kfstInitialize so that the access
* functions do not have to re-parse the header. All '...Pos' members are byte
* offsets into 'fstStream' (file header length already added).
*/
/* pointer alias; the access functions cast the opaque picokfst_FST handle to this type */
typedef struct kfst_subobj * kfst_SubObj;
typedef struct kfst_subobj{
picoos_uint8 * fstStream; /* the byte stream base address (copied from the knowledge base's 'base') */
picoos_int32 hdrLen; /* length of file header in bytes (set to FileHdrSize) */
picoos_int32 transductionMode; /* transduction mode to be used for FST */
picoos_int32 nrClasses; /* nr of pair/transition classes in FST; class is in [1..nrClasses] */
picoos_int32 nrStates; /* nr of states in FST; state is in [1..nrStates] */
picoos_int32 termClass; /* pair class of terminator symbol pair; probably obsolete (only written, never read in this file) */
picoos_int32 alphaHashTabSize; /* nr of entries in the pair alphabet hash table; each entry is a 4-byte signed offset */
picoos_int32 alphaHashTabPos; /* offset in fstStream of the start of the pair alphabet hash table */
picoos_int32 transTabEntrySize; /* size in bytes of each transition table entry */
picoos_int32 transTabPos; /* offset in fstStream of the start of the transition table */
picoos_int32 inEpsStateTabPos; /* offset in fstStream of the start of the input epsilon transition table */
picoos_int32 accStateTabPos; /* offset in fstStream of the table of accepting states (one byte per state) */
} kfst_subobj_t;
/* ************************************************************/
/* primitives for reading from byte stream */
/* ************************************************************/
/* Reads 'nrBytes' bytes from 'stream', starting at offset '*pos', and combines them
   most-significant byte first into the unsigned number '*num'.
   On return '*pos' is the offset of the first byte after the number.
   With 'nrBytes' == 0 the result is 0 and '*pos' is left unchanged. */
static void FixedBytesToUnsignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_uint32 * num)
{
    picoos_uint32 acc = 0;
    picoos_uint8 remaining = nrBytes;

    while (remaining > 0) {
        /* the low 8 bits are empty after the shift, so OR-ing is the same as adding */
        acc = (acc << 8) | (picoos_uint32)stream[*pos];
        (*pos)++;
        remaining--;
    }
    (*num) = acc;
}
/* Reads 'nrBytes' bytes from 'stream', starting at offset '*pos', most-significant
   byte first, and decodes them into the signed number '*num'.
   The lowest bit of the raw value is the sign flag and the remaining bits the magnitude:
   an even raw value v stands for v/2, an odd raw value v stands for -((v-1)/2) - 1.
   On return '*pos' is the offset of the first byte after the number. */
static void FixedBytesToSignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_int32 * num)
{
    picoos_uint32 raw = 0;
    picoos_uint8 remaining;

    for (remaining = nrBytes; remaining > 0; remaining--) {
        raw = (raw << 8) | (picoos_uint32)stream[*pos];
        (*pos)++;
    }

    if ((raw & 1u) != 0) {
        /* odd: negative number; (raw - 1) / 2 equals raw >> 1 for odd raw */
        (*num) = -((picoos_int32)(raw >> 1)) - 1;
    } else {
        /* even: non-negative number */
        (*num) = raw >> 1;
    }
}
/* Decodes a variable-length number from 'stream', starting at offset '*pos', into
   the signed number '*num'.
   Each byte contributes 7 bits, most-significant group first; bytes below 128 are
   continuation bytes and the first byte >= 128 is the last one (128 is subtracted
   from it). The resulting raw value is then sign-decoded: an even raw value v
   stands for v/2, an odd raw value v stands for -((v-1)/2) - 1.
   On return '*pos' is the offset of the first byte after the number. */
static void BytesToNum (picoos_uint8 * stream, picoos_uint32 * pos, picoos_int32 * num)
{
    picoos_uint32 raw = 0;
    picoos_uint32 cur;

    for (;;) {
        cur = (picoos_uint32)stream[*pos];
        (*pos)++;
        if (cur >= 128) {
            /* terminating byte reached */
            break;
        }
        raw = (raw << 7) + cur;
    }
    raw = (raw << 7) + (cur - 128);

    if ((raw & 1u) != 0) {
        /* odd: negative number; (raw - 1) / 2 equals raw >> 1 for odd raw */
        (*num) = -((picoos_int32)(raw >> 1)) - 1;
    } else {
        /* even: non-negative number */
        (*num) = raw >> 1;
    }
}
/* ************************************************************/
/* setting up FST from byte stream */
/* ************************************************************/
/* Decodes the FST header found in the knowledge base byte stream and caches the
 * values in the kfst sub-object attached to 'this'.
 *
 * The encoded header starts FileHdrSize bytes into the stream and consists of
 * variable-length numbers (see BytesToNum) in this order: transduction mode,
 * nr of classes, nr of states, terminator class, alphabet hash table size,
 * alphabet hash table offset, transition table entry size, transition table
 * offset, input-epsilon state table offset, accepting state table offset.
 * The table offsets are stored relative to the end of the file header, so the
 * header length is added to turn them into offsets into the stream.
 *
 * Returns PICO_OK on success. If 'this' or its sub-object is NULL, raises
 * PICO_EXC_KB_MISSING via common->em and returns the resulting status.
 * The decoded values are not validated against the size of the stream. */
static pico_status_t kfstInitialize(register picoknow_KnowledgeBase this,
        picoos_Common common)
{
    picoos_uint32 curpos;
    picoos_int32 offs;
    kfst_subobj_t * kfst;

    PICODBG_DEBUG(("kfstInitialize -- start\n"));
    if (NULL == this || NULL == this->subObj) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL,
                NULL);
    }
    kfst = (kfst_subobj_t *) this->subObj;
    /* +CT+ */
    kfst->fstStream = this->base;
    /* 'base' is a pointer: print it with %p (was %d, a format/argument mismatch) */
    PICODBG_TRACE(("base: %p\n", (void *) this->base));
    kfst->hdrLen = FileHdrSize;
    curpos = kfst->hdrLen;
    BytesToNum(kfst->fstStream, &curpos, &kfst->transductionMode);
    BytesToNum(kfst->fstStream, &curpos, &kfst->nrClasses);
    BytesToNum(kfst->fstStream, &curpos, &kfst->nrStates);
    BytesToNum(kfst->fstStream, &curpos, &kfst->termClass);
    BytesToNum(kfst->fstStream, &curpos, &kfst->alphaHashTabSize);
    BytesToNum(kfst->fstStream, &curpos, &offs);
    kfst->alphaHashTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream, &curpos, &kfst->transTabEntrySize);
    BytesToNum(kfst->fstStream, &curpos, &offs);
    kfst->transTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream, &curpos, &offs);
    kfst->inEpsStateTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream, &curpos, &offs);
    kfst->accStateTabPos = kfst->hdrLen + offs;
    /* -CT- */
    return PICO_OK;
}
/* Sub-object destructor installed as 'subDeallocate' on the knowledge base:
 * hands the kfst sub-object of 'this' back to memory manager 'mm'.
 * A NULL 'this' is ignored. Always returns PICO_OK. */
static pico_status_t kfstSubObjDeallocate(register picoknow_KnowledgeBase this,
        picoos_MemoryManager mm)
{
    if (NULL == this) {
        return PICO_OK;
    }
    picoos_deallocate(mm, (void **) &this->subObj);
    return PICO_OK;
}
/* Calculates a small amount of data (e.g. table offsets) from the kb for fast access.
 * This data is encapsulated in a picokfst_FST that can later be retrieved
 * with picokfst_getFST.
 *
 * A kb with size 0 is treated as a dummy kb: nothing is allocated and PICO_OK
 * is returned.
 *
 * Returns PICO_OK on success. Raises PICO_EXC_KB_MISSING if 'this' is NULL and
 * PICO_EXC_OUT_OF_MEM if the sub-object cannot be allocated. If decoding the
 * header fails, the sub-object is released again and the failure status is
 * returned to the caller (it used to be dropped in favour of PICO_OK). */
pico_status_t picokfst_specializeFSTKnowledgeBase(picoknow_KnowledgeBase this,
        picoos_Common common)
{
    pico_status_t status = PICO_OK;

    if (NULL == this) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL);
    }
    if (0 < this->size) {
        /* not a dummy kb */
        this->subDeallocate = kfstSubObjDeallocate;
        this->subObj = picoos_allocate(common->mm, sizeof(kfst_subobj_t));
        if (NULL == this->subObj) {
            return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
        }
        status = kfstInitialize(this, common);
        if (PICO_OK != status) {
            /* NOTE(review): picoos_deallocate takes the address of the pointer and
               presumably resets this->subObj to NULL -- confirm in picoos */
            picoos_deallocate(common->mm, (void **) &this->subObj);
        }
    }
    return status;
}
/* ************************************************************/
/* FST type and getFST function */
/* ************************************************************/
/* Returns the FST handle (the kb's sub-object) for usage in a PU.
 * Returns NULL if 'this' is NULL; the result is also NULL when the kb has no
 * sub-object attached. */
picokfst_FST picokfst_getFST(picoknow_KnowledgeBase this)
{
    return (NULL == this) ? NULL : (picokfst_FST) this->subObj;
}
/* ************************************************************/
/* FST access methods */
/* ************************************************************/
/* Returns the transduction mode stored in the FST header, or 0 if 'this' is NULL.
 * (see also the description in the header file) */
extern picoos_uint8 picokfst_kfstGetTransductionMode(picokfst_FST this)
{
    kfst_SubObj fst = (kfst_SubObj) this;

    if (fst == NULL) {
        return 0;
    }
    return fst->transductionMode;
}
/* Reports the number of states and the number of pair/transition classes of the FST
 * in '*nrStates' and '*nrClasses'. Both are set to 0 if 'this' is NULL.
 * (see also the description in the header file) */
extern void picokfst_kfstGetFSTSizes (picokfst_FST this, picoos_int32 *nrStates, picoos_int32 *nrClasses)
{
    kfst_SubObj fst = (kfst_SubObj) this;

    if (fst == NULL) {
        *nrStates = 0;
        *nrClasses = 0;
        return;
    }
    *nrStates = fst->nrStates;
    *nrClasses = fst->nrClasses;
}
/* Looks up input symbol 'inSym' in the pair alphabet hash table and prepares the
 * enumeration of its symbol pairs.
 * (see also the description in the header file)
 *
 * The hash bucket is 'inSym' modulo the table size. Each bucket holds a 4-byte
 * signed offset (relative to the table start) of the first symbol cell, or a
 * value <= 0 if the bucket is empty. A symbol cell starts with the symbol id,
 * followed by the relative offset of the next cell with the same hash
 * (<= 0 at the end of the chain).
 *
 * On success '*inSymFound' is set to 1 and '*searchState' to the stream offset
 * right after the matching cell's two header numbers, ready to be passed to
 * picokfst_kfstGetNextPair. Otherwise '*inSymFound' is 0 and '*searchState' is -1.
 * That is also the result for a NULL 'this', for a hash table size <= 0 (which
 * would otherwise cause a division by zero) and for a negative 'inSym' (which
 * would otherwise yield a negative bucket index and read before the table). */
extern void picokfst_kfstStartPairSearch (picokfst_FST this, picokfst_symid_t inSym,
        picoos_bool * inSymFound, picoos_int32 * searchState)
{
    picoos_uint32 pos;
    picoos_int32 offs;
    picoos_int32 h;
    picoos_int32 inSymCellPos;
    picoos_int32 inSymX;
    picoos_int32 nextSameHashInSymOffs;
    kfst_SubObj fst = (kfst_SubObj) this;

    (*searchState) = -1;
    (*inSymFound) = 0;
    if ((fst == NULL) || (fst->alphaHashTabSize <= 0) || (inSym < 0)) {
        return;
    }
    h = inSym % fst->alphaHashTabSize;
    pos = fst->alphaHashTabPos + (h * 4);
    FixedBytesToSignedNum(fst->fstStream, 4, &pos, &offs);
    if (offs > 0) {
        inSymCellPos = fst->alphaHashTabPos + offs;
        pos = inSymCellPos;
        BytesToNum(fst->fstStream, &pos, &inSymX);
        BytesToNum(fst->fstStream, &pos, &nextSameHashInSymOffs);
        /* follow the collision chain; offsets are strictly positive, so the walk only moves forward */
        while ((inSymX != inSym) && (nextSameHashInSymOffs > 0)) {
            inSymCellPos = inSymCellPos + nextSameHashInSymOffs;
            pos = inSymCellPos;
            BytesToNum(fst->fstStream, &pos, &inSymX);
            BytesToNum(fst->fstStream, &pos, &nextSameHashInSymOffs);
        }
        if (inSymX == inSym) {
            /* input symbol found; state is set to position after symbol cell */
            (*searchState) = pos;
            (*inSymFound) = 1;
        }
    }
}
/* Fetches the next (output symbol, pair class) entry at the stream offset held in
 * '*searchState' and advances the search state past it.
 * (see also the description in the header file)
 *
 * If '*searchState' is negative, or the symbol read at that position equals
 * PICOKFST_SYMID_ILLEG, no pair is delivered: '*pairFound' is 0,
 * '*outSym' is PICOKFST_SYMID_ILLEG and '*pairClass' is -1; in the second case
 * '*searchState' is additionally reset to -1.
 * Otherwise '*pairFound' is 1, '*outSym' and '*pairClass' hold the decoded pair
 * and '*searchState' is the offset right after it. */
extern void picokfst_kfstGetNextPair (picokfst_FST this, picoos_int32 * searchState,
        picoos_bool * pairFound,
        picokfst_symid_t * outSym, picokfst_class_t * pairClass)
{
    kfst_SubObj fst = (kfst_SubObj) this;
    picoos_uint32 cursor;
    picoos_int32 decoded;

    if ((*searchState) >= 0) {
        cursor = (*searchState);
        BytesToNum(fst->fstStream, &cursor, &decoded);
        *outSym = (picokfst_symid_t)decoded;
        if ((*outSym) != PICOKFST_SYMID_ILLEG) {
            BytesToNum(fst->fstStream, &cursor, &decoded);
            *pairClass = (picokfst_class_t)decoded;
            (*pairFound) = 1;
            (*searchState) = cursor;
            return;
        }
        /* end-of-list marker read: invalidate the search state */
        (*searchState) = -1;
    }

    /* nothing (more) to deliver */
    (*pairFound) = 0;
    (*outSym) = PICOKFST_SYMID_ILLEG;
    (*pairClass) = -1;
}
/* Looks up the end state reached from 'startState' via transition class 'transClass'.
 * (see also the description in the header file)
 *
 * The transition table is a row-major array with one row per state and one
 * entry of 'transTabEntrySize' bytes per class; states and classes are 1-based.
 * '*endState' is set to the table entry, or to 0 if 'this' is NULL or if
 * 'startState' / 'transClass' lie outside [1..nrStates] / [1..nrClasses]. */
extern void picokfst_kfstGetTrans (picokfst_FST this, picokfst_state_t startState, picokfst_class_t transClass,
        picokfst_state_t * endState)
{
    picoos_uint32 pos;
    picoos_int32 index;
    picoos_uint32 endStateX;
    kfst_SubObj fst = (kfst_SubObj) this;

    if ((fst == NULL)
            || (startState < 1) || (startState > fst->nrStates)
            || (transClass < 1) || (transClass > fst->nrClasses)) {
        (*endState) = 0;
    } else {
        index = (startState - 1) * fst->nrClasses + transClass - 1;
        pos = fst->transTabPos + (index * fst->transTabEntrySize);
        FixedBytesToUnsignedNum(fst->fstStream, fst->transTabEntrySize, &pos, &endStateX);
        (*endState) = endStateX;
    }
}
/* Prepares the enumeration of the input-epsilon transitions leaving 'startState'.
 * (see also the description in the header file)
 *
 * The input-epsilon state table holds one 4-byte signed offset per state
 * (states are 1-based), relative to the start of that table. If the offset is
 * positive, '*inEpsTransFound' is set to 1 and '*searchState' to the
 * corresponding stream offset, ready to be passed to
 * picokfst_kfstGetNextInEpsTrans. Otherwise -- also for a NULL 'this' or a
 * state outside [1..nrStates] -- '*inEpsTransFound' is 0 and '*searchState' is -1. */
extern void picokfst_kfstStartInEpsTransSearch (picokfst_FST this, picokfst_state_t startState,
        picoos_bool * inEpsTransFound, picoos_int32 * searchState)
{
    picoos_int32 offs;
    picoos_uint32 pos;
    kfst_SubObj fst = (kfst_SubObj) this;

    (*searchState) = -1;
    (*inEpsTransFound) = 0;
    if ((fst != NULL) && (startState > 0) && (startState <= fst->nrStates)) {
        pos = fst->inEpsStateTabPos + (startState - 1) * 4;
        FixedBytesToSignedNum(fst->fstStream, 4, &pos, &offs);
        if (offs > 0) {
            (*searchState) = fst->inEpsStateTabPos + offs;
            (*inEpsTransFound) = 1;
        }
    }
}
/* Fetches the next (output symbol, end state) input-epsilon transition at the
 * stream offset held in '*searchState' and advances the search state past it.
 * (see also the description in the header file)
 *
 * If '*searchState' is negative, or the symbol read at that position equals
 * PICOKFST_SYMID_ILLEG, no transition is delivered: '*inEpsTransFound' is 0,
 * '*outSym' is PICOKFST_SYMID_ILLEG and '*endState' is 0; in the second case
 * '*searchState' is additionally reset to -1.
 * Otherwise '*inEpsTransFound' is 1, '*outSym' and '*endState' hold the decoded
 * transition and '*searchState' is the offset right after it. */
extern void picokfst_kfstGetNextInEpsTrans (picokfst_FST this, picoos_int32 * searchState,
        picoos_bool * inEpsTransFound,
        picokfst_symid_t * outSym, picokfst_state_t * endState)
{
    kfst_SubObj fst = (kfst_SubObj) this;
    picoos_uint32 cursor;
    picoos_int32 decoded;

    if ((*searchState) >= 0) {
        cursor = (*searchState);
        BytesToNum(fst->fstStream, &cursor, &decoded);
        *outSym = (picokfst_symid_t)decoded;
        if ((*outSym) != PICOKFST_SYMID_ILLEG) {
            BytesToNum(fst->fstStream, &cursor, &decoded);
            *endState = (picokfst_state_t)decoded;
            (*inEpsTransFound) = 1;
            (*searchState) = cursor;
            return;
        }
        /* end-of-list marker read: invalidate the search state */
        (*searchState) = -1;
    }

    /* nothing (more) to deliver */
    (*inEpsTransFound) = 0;
    (*outSym) = PICOKFST_SYMID_ILLEG;
    (*endState) = 0;
}
/* Tells whether 'state' is an accepting state of the FST.
 * (see also the description in the header file)
 *
 * The accepting state table holds one byte per state (states are 1-based);
 * a state is accepting if its byte equals 1.
 * Returns 0 if 'this' is NULL or 'state' lies outside [1..nrStates]. */
extern picoos_bool picokfst_kfstIsAcceptingState (picokfst_FST this, picokfst_state_t state)
{
    picoos_uint32 pos;
    picoos_uint32 val;
    kfst_SubObj fst = (kfst_SubObj) this;

    if ((fst != NULL) && (state > 0) && (state <= fst->nrStates)) {
        pos = fst->accStateTabPos + (state - 1);
        FixedBytesToUnsignedNum(fst->fstStream, 1, &pos, &val);
        return (val == 1);
    } else {
        return 0;
    }
}
#ifdef __cplusplus
}
#endif
/* End picokfst.c */