| /* |
| * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /** |
| * @file picoktab.c |
| * |
| * symbol tables needed at runtime |
| * |
| * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland |
| * All rights reserved. |
| * |
| * History: |
| * - 2009-04-20 -- initial version |
| * |
| */ |
| |
| #include "picoos.h" |
| #include "picodbg.h" |
| #include "picoknow.h" |
| #include "picobase.h" |
| #include "picoktab.h" |
| #include "picodata.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| #if 0 |
| } |
| #endif |
| |
| |
| /** @todo : the following would be better part of a knowledge base. |
| * Make sure it is consistent with the phoneme symbol table used in the lingware */ |
| |
| /* PLANE_PHONEMES */ |
| |
| /* PLANE_POS */ |
| |
| /* PLANE_PB_STRENGTHS */ |
| |
| /* PLANE_ACCENTS */ |
| |
| /* PLANE_INTERN */ |
/* Hard-coded ids that ktabIdsInitialize() stores into the phonStartId and
 * phonTermId members of the fixed-ids sub object. */
| #define PICOKTAB_TMPID_PHONSTART '\x26' /* 38 '&' */ |
| #define PICOKTAB_TMPID_PHONTERM '\x23' /* 35 '#' */ |
| |
| |
| /* ************************************************************/ |
| /* fixed ids */ |
| /* ************************************************************/ |
| |
| |
| static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this, |
| picoos_Common common) |
| { |
| picoktab_FixedIds ids; |
| |
| PICODBG_DEBUG(("start")); |
| |
| if (NULL == this || NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| ids = (picoktab_FixedIds) this->subObj; |
| |
| ids->phonStartId = PICOKTAB_TMPID_PHONSTART; |
| ids->phonTermId = PICOKTAB_TMPID_PHONTERM; |
| return PICO_OK; |
| } |
| |
| |
| static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this, |
| picoos_MemoryManager mm) |
| { |
| if (NULL != this) { |
| picoos_deallocate(mm, (void *) &this->subObj); |
| } |
| return PICO_OK; |
| } |
| |
| pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this, |
| picoos_Common common) |
| { |
| if (NULL == this) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| this->subDeallocate = ktabIdsSubObjDeallocate; |
| this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t)); |
| if (NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, |
| NULL, NULL); |
| } |
| return ktabIdsInitialize(this, common); |
| } |
| |
| picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this) |
| { |
| return ((NULL == this) ? NULL : ((picoktab_FixedIds) this->subObj)); |
| } |
| |
| |
| picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm) |
| { |
| picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this)); |
| if (NULL != this) { |
| /* initialize */ |
| } |
| return this; |
| } |
| |
| |
| void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this) |
| { |
| if (NULL != (*this)) { |
| /* terminate */ |
| picoos_deallocate(mm,(void *)this); |
| } |
| } |
| |
| |
| |
| /* ************************************************************/ |
| /* Graphs */ |
| /* ************************************************************/ |
| |
| /* overview binary file format for graphs kb: |
| |
| graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs |
| |
| NROFSENTRIES : 2 bytes, number of entries in offset table |
| SIZEOFSENTRY : 1 byte, size of one entry in offset table |
| |
| ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET) |
| |
| OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs |
| |
| graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph) |
| |
| graph = PROPSET FROM [TO] [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] [PUNC] |
| |
| FROM : 1..4 unsigned bytes, UTF8 character without terminating 0 |
| TO : 1..4 unsigned bytes, UTF8 character without terminating 0 |
| PROPSET : 1 unsigned byte, least significant bit : has TO field |
| next bit : has TOKENTYPE |
| next bit : has TOKENSUBTYPE |
| next bit : has VALUE |
| next bit : has LOWERCASE |
| next bit : has GRAPHSUBS1 |
| next bit : has GRAPHSUBS2 |
| next bit : has PUNC |
| |
| TOKENTYPE : 1 unsigned byte |
| TOKENSUBTYPE : 1 unsigned byte |
| VALUE : 1 unsigned byte |
| LOWERCASE : 1..4 unsigned bytes, UTF8 character without terminating 0 |
| GRAPHSUBS1 : 1..4 unsigned bytes, UTF8 character without terminating 0 |
| GRAPHSUBS2 : 1..4 unsigned bytes, UTF8 character without terminating 0 |
| PUNC : 1 unsigned byte |
| */ |
| |
| static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop); |
| |
/* Byte positions inside the graphs kb, as read by ktabGraphsInitialize():
 *   NR_OFFSET     - 2-byte entry count (low byte first)
 *   SIZE_OFFSET   - 1-byte size of one offset-table entry
 *   OFFSET_TABLE  - first byte of the offset table
 *   GRAPH_TABLE   - base used for indexing graph entries; it is 0 because
 *                   the stored offsets are used relative to the kb base */
| #define KTAB_START_GRAPHS_NR_OFFSET 0 |
| #define KTAB_START_GRAPHS_SIZE_OFFSET 2 |
| #define KTAB_START_GRAPHS_OFFSET_TABLE 3 |
| #define KTAB_START_GRAPHS_GRAPH_TABLE 0 |
| |
| /* bitmasks to extract the grapheme properties info from the property set */ |
/* One bit per optional grapheme field in the PROPSET byte, lowest bit first.
 * Written as plain hex integers; the values are 0x01 .. 0x80. */
#define KTAB_GRAPH_PROPSET_TO            ((picoos_uint8)0x01)
#define KTAB_GRAPH_PROPSET_TOKENTYPE     ((picoos_uint8)0x02)
#define KTAB_GRAPH_PROPSET_TOKENSUBTYPE  ((picoos_uint8)0x04)
#define KTAB_GRAPH_PROPSET_VALUE         ((picoos_uint8)0x08)
#define KTAB_GRAPH_PROPSET_LOWERCASE     ((picoos_uint8)0x10)
#define KTAB_GRAPH_PROPSET_GRAPHSUBS1    ((picoos_uint8)0x20)
#define KTAB_GRAPH_PROPSET_GRAPHSUBS2    ((picoos_uint8)0x40)
#define KTAB_GRAPH_PROPSET_PUNCT         ((picoos_uint8)0x80)
| |
| |
/* Sub object of a graphs knowledge base; filled by ktabGraphsInitialize(). */
| typedef struct ktabgraphs_subobj *ktabgraphs_SubObj; |
| |
| typedef struct ktabgraphs_subobj { |
/* number of entries in the offset table */
| picoos_uint16 nrOffset; |
/* size in bytes of one offset-table entry */
| picoos_uint16 sizeOffset; |
| |
/* both pointers refer into the kb data (this->base); they are not owned here */
| picoos_uint8 * offsetTable; |
| picoos_uint8 * graphTable; |
| } ktabgraphs_subobj_t; |
| |
| |
| |
| static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this, |
| picoos_Common common) { |
| ktabgraphs_subobj_t * ktabgraphs; |
| |
| PICODBG_DEBUG(("start")); |
| |
| if (NULL == this || NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| ktabgraphs = (ktabgraphs_subobj_t *) this->subObj; |
| ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]); |
| ktabgraphs->sizeOffset = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]); |
| ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]); |
| ktabgraphs->graphTable = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]); |
| return PICO_OK; |
| } |
| |
| static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this, |
| picoos_MemoryManager mm) { |
| if (NULL != this) { |
| picoos_deallocate(mm, (void *) &this->subObj); |
| } |
| return PICO_OK; |
| } |
| |
| |
| pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this, |
| picoos_Common common) { |
| if (NULL == this) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| this->subDeallocate = ktabGraphsSubObjDeallocate; |
| this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t)); |
| if (NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, |
| NULL, NULL); |
| } |
| return ktabGraphsInitialize(this, common); |
| } |
| |
| |
| picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) { |
| if (NULL == this) { |
| return NULL; |
| } else { |
| return (picoktab_Graphs) this->subObj; |
| } |
| } |
| |
| |
| /* Graphs methods */ |
| |
| picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this, |
| const picoos_uint8 *graph, |
| const picoos_uint8 graphlenmax) { |
| |
| picoos_uint8 ui8App; |
| picoos_uint32 graphsOffset; |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| |
| ui8App = graphlenmax; /* avoid warning "var not used in this function"*/ |
| |
| graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph); |
| return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV; |
| } |
| |
| |
| static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str) |
| { |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| picoos_uint32 i, l; |
| |
| i = 0; |
| l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]); |
| while (i<l) { |
| str[i] = g->graphTable[graphsOffset+propOffset+i]; |
| i++; |
| } |
| str[l] = 0; |
| } |
| |
| |
| static picoos_uint32 ktab_propOffset(const picoktab_Graphs this, |
| picoos_uint32 graphsOffset, picoos_uint32 prop) |
| /* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table; |
| If the property is found, a value > 0 is returned otherwise 0 */ |
| { |
| picoos_uint32 n = 0; |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; |
| |
| if ((g->graphTable[graphsOffset] & prop) == prop) { |
| n = n + 1; /* overread PROPSET field */ |
| n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */ |
| if (prop > KTAB_GRAPH_PROPSET_TO) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO) |
| == KTAB_GRAPH_PROPSET_TO) { |
| n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE) |
| == KTAB_GRAPH_PROPSET_TOKENTYPE) { |
| n = n + 1; /* overread TOKENTYPE field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) |
| == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { |
| n = n + 1; /* overread stokentype field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_VALUE) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE) |
| == KTAB_GRAPH_PROPSET_VALUE) { |
| n = n + 1; /* overread value field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE) |
| == KTAB_GRAPH_PROPSET_LOWERCASE) { |
| n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1) |
| == KTAB_GRAPH_PROPSET_GRAPHSUBS1) { |
| n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2) |
| == KTAB_GRAPH_PROPSET_GRAPHSUBS2) { |
| n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */ |
| } |
| } else { |
| return n; |
| } |
| if (prop > KTAB_GRAPH_PROPSET_PUNCT) { |
| if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT) |
| == KTAB_GRAPH_PROPSET_PUNCT) { |
| n = n + 1; /* overread value field */ |
| } |
| } else { |
| return n; |
| } |
| } |
| |
| return n; |
| } |
| |
| |
| picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph) |
| { ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| picoos_int32 a, b, m; |
| picoos_uint32 graphsOffset; |
| picoos_uint32 propOffset; |
| picobase_utf8char from; |
| picobase_utf8char to; |
| picoos_bool utfGEfrom; |
| picoos_bool utfLEto; |
| |
| if (g->nrOffset > 0) { |
| a = 0; |
| b = g->nrOffset-1; |
| do { |
| m = (a+b) / 2; |
| |
| /* get offset to graph[m] */ |
| if (g->sizeOffset == 1) { |
| graphsOffset = g->offsetTable[g->sizeOffset*m]; |
| } |
| else { |
| graphsOffset = g->offsetTable[g->sizeOffset*m ] + |
| 256*g->offsetTable[g->sizeOffset*m + 1]; |
| /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset)); |
| */ |
| } |
| |
| /* get FROM and TO field of graph[m] */ |
| ktab_getStrProp(this, graphsOffset, 1, from); |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO); |
| if (propOffset > 0) { |
| ktab_getStrProp(this, graphsOffset, propOffset, to); |
| } |
| else { |
| picoos_strcpy((picoos_char *)to, (picoos_char *)from); |
| } |
| |
| /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to)); |
| */ |
| utfGEfrom = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0; |
| utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0; |
| |
| if (utfGEfrom && utfLEto) { |
| /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph)); |
| */ |
| return graphsOffset; |
| } |
| if (!utfGEfrom) { |
| b = m-1; |
| } |
| else if (!utfLEto) { |
| a = m+1; |
| } |
| } while (a<=b); |
| } |
| PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph)); |
| return 0; |
| } |
| |
| |
| |
| |
| picoos_bool picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType) |
| { |
| picoos_uint32 propOffset; |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE); |
| if (propOffset > 0) { |
| *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]); |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| |
| |
| picoos_bool picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType) |
| { |
| picoos_uint32 propOffset; |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE); |
| if (propOffset > 0) { |
| *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]); |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| |
| picoos_bool picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value) |
| { |
| picoos_uint32 propOffset; |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE); |
| if (propOffset > 0) { |
| *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]); |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| |
| |
| picoos_bool picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2) |
| { |
| picoos_uint32 propOffset; |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT); |
| if (propOffset > 0) { |
| if (g->graphTable[graphsOffset+propOffset] == 2) { |
| *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND; |
| } |
| else { |
| *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND; |
| } |
| if (g->graphTable[graphsOffset+1] == '.') { |
| *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T; |
| } |
| else if (g->graphTable[graphsOffset+1] == '?') { |
| *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q; |
| } |
| else if (g->graphTable[graphsOffset+1] == '!') { |
| *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E; |
| } |
| else { |
| *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE; |
| } |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| |
| |
| picoos_bool picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase) |
| { |
| picoos_uint32 propOffset; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE); |
| if (propOffset > 0) { |
| ktab_getStrProp(this, graphsOffset, propOffset, lowercase); |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| |
| |
| picoos_bool picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1) |
| { |
| picoos_uint32 propOffset; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1); |
| if (propOffset > 0) { |
| ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1); |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| |
| |
| picoos_bool picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2) |
| { |
| picoos_uint32 propOffset; |
| |
| propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2); |
| if (propOffset > 0) { |
| ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2); |
| return TRUE; |
| } |
| else { |
| return FALSE; |
| } |
| } |
| /* *****************************************************************/ |
| /* used for tools */ |
| |
| static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to) |
| { |
| picoos_uint32 l; |
| l = picobase_det_utf8_length(**pos); |
| while (l>0) { |
| *(to++) = *((*pos)++); |
| l--; |
| } |
| *to = 0; |
| } |
| |
| picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this) |
| { |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; |
| return g->nrOffset; |
| } |
| |
| void picoktab_graphsGetGraphInfo(const picoktab_Graphs this, |
| picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to, |
| picoos_uint8 * propset, |
| picoos_uint8 * stokenType, picoos_uint8 * stokenSubType, |
| picoos_uint8 * value, picoos_uchar * lowercase, |
| picoos_uchar * graphsubs1, picoos_uchar * graphsubs2, |
| picoos_uint8 * punct) { |
| ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; |
| picoos_uint32 graphsOffset; |
| picoos_uint8 * pos; |
| |
| /* calculate offset of graph[graphIndex] */ |
| if (g->sizeOffset == 1) { |
| graphsOffset = g->offsetTable[graphIndex]; |
| } else { |
| graphsOffset = g->offsetTable[2 * graphIndex] |
| + (g->offsetTable[2 * graphIndex + 1] << 8); |
| } |
| pos = &(g->graphTable[graphsOffset]); |
| *propset = *pos; |
| |
| pos++; /* advance to FROM */ |
| ktab_getUtf8(&pos, from); /* get FROM and advance */ |
| if ((*propset) & KTAB_GRAPH_PROPSET_TO) { |
| ktab_getUtf8(&pos, to); /* get TO and advance */ |
| } else { |
| picoos_strcpy((picoos_char *)to, (picoos_char *)from); |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) { |
| (*stokenType) = *(pos++); /* get TOKENTYPE and advance */ |
| } else { |
| (*stokenType) = -1; |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { |
| (*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */ |
| } else { |
| (*stokenSubType) = -1; |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) { |
| (*value) = *(pos++); /* get VALUE and advance */ |
| } else { |
| (*value) = -1; |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) { |
| ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */ |
| } else { |
| lowercase[0] = NULLC; |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) { |
| ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */ |
| } else { |
| graphsubs1[0] = NULLC; |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) { |
| ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */ |
| } else { |
| graphsubs2[0] = NULLC; |
| } |
| if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) { |
| (*punct) = *(pos++); /* get PUNCT and advance */ |
| } else { |
| (*punct) = -1; |
| } |
| } |
| |
| /* ************************************************************/ |
| /* Phones */ |
| /* ************************************************************/ |
| |
| /* overview binary file format for phones kb: |
| |
| phones-kb = specids propertytable |
| |
| specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1 |
| RESERVE1 RESERVE1 RESERVE1 |
| |
| propertytable = {PHONEPROP2}=256 |
| |
| PRIMSTRESSID1: one byte, ID of primary stress |
| SECSTRESSID1: one byte, ID of secondary stress |
| SYLLBOUNDID1: one byte, ID of syllable boundary |
| PAUSEID1: one byte, ID of pause |
| WORDBOUNDID1: one byte, ID of word boundary |
| RESERVE1: reserved for future use |
| |
| PHONEPROP2: one byte, max. of 256 phones directly access this table |
| to check a property for a phone; binary properties |
| encoded (1 bit per prop) |
| least significant bit: vowel |
| next bit: diphth |
| next bit: glott |
| next bit: nonsyllvowel |
| next bit: syllcons |
| 3 bits spare |
| */ |
| |
/* Layout of the phones kb as used by ktabPhonesInitialize():
 * the special ids start at byte KTAB_START_SPECIDS and are indexed with
 * the KTAB_IND_* values; the per-phone property table starts at byte
 * KTAB_START_PROPS. */
| #define KTAB_START_SPECIDS 0 |
| #define KTAB_IND_PRIMSTRESS 0 |
| #define KTAB_IND_SECSTRESS 1 |
| #define KTAB_IND_SYLLBOUND 2 |
| #define KTAB_IND_PAUSE 3 |
| #define KTAB_IND_WORDBOUND 4 |
| |
| #define KTAB_START_PROPS 8 |
| |
| |
/* Sub object of a phones knowledge base; filled by ktabPhonesInitialize(). */
| typedef struct ktabphones_subobj *ktabphones_SubObj; |
| |
| typedef struct ktabphones_subobj { |
/* points at the special ids inside the kb data; indexed with KTAB_IND_* */
| picoos_uint8 *specids; |
/* points at the property table inside the kb data; indexed with the phone id */
| picoos_uint8 *props; |
| } ktabphones_subobj_t; |
| |
| |
| /* bitmasks to extract the property info from props */ |
/* One bit per binary phone property in a property-table byte. */
#define KTAB_PPROP_VOWEL          0x01
#define KTAB_PPROP_DIPHTH         0x02
#define KTAB_PPROP_GLOTT          0x04
#define KTAB_PPROP_NONSYLLVOWEL   0x08
#define KTAB_PPROP_SYLLCONS       0x10
| |
| |
| static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this, |
| picoos_Common common) { |
| ktabphones_subobj_t * ktabphones; |
| |
| PICODBG_DEBUG(("start")); |
| |
| if (NULL == this || NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| ktabphones = (ktabphones_subobj_t *) this->subObj; |
| ktabphones->specids = &(this->base[KTAB_START_SPECIDS]); |
| ktabphones->props = &(this->base[KTAB_START_PROPS]); |
| return PICO_OK; |
| } |
| |
| static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this, |
| picoos_MemoryManager mm) { |
| if (NULL != this) { |
| picoos_deallocate(mm, (void *) &this->subObj); |
| } |
| return PICO_OK; |
| } |
| |
| pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this, |
| picoos_Common common) { |
| if (NULL == this) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| this->subDeallocate = ktabPhonesSubObjDeallocate; |
| this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t)); |
| if (NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, |
| NULL, NULL); |
| } |
| return ktabPhonesInitialize(this, common); |
| } |
| |
| picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) { |
| if (NULL == this) { |
| return NULL; |
| } else { |
| return (picoktab_Phones) this->subObj; |
| } |
| } |
| |
| |
| /* Phones methods */ |
| |
| picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]); |
| } |
| picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]); |
| } |
| picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]); |
| } |
| picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]); |
| } |
| picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]); |
| } |
| |
| picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| picoos_uint8 props; |
| props = ((ktabphones_SubObj)this)->props[ch]; |
| return (((KTAB_PPROP_VOWEL & props) && |
| !(KTAB_PPROP_NONSYLLVOWEL & props)) |
| || (KTAB_PPROP_SYLLCONS & props)); |
| } |
| |
| picoos_bool picoktab_isPrimstress(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]); |
| } |
| picoos_bool picoktab_isSecstress(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]); |
| } |
| picoos_bool picoktab_isSyllbound(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]); |
| } |
| picoos_bool picoktab_isWordbound(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]); |
| } |
| picoos_bool picoktab_isPause(const picoktab_Phones this, |
| const picoos_uint8 ch) { |
| return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]); |
| } |
| |
| picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) { |
| return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]; |
| } |
| picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) { |
| return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]; |
| } |
| picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) { |
| return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]; |
| } |
| picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) { |
| return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]; |
| } |
| picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) { |
| return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]; |
| } |
| |
| /* ************************************************************/ |
| /* Pos */ |
| /* ************************************************************/ |
| |
| /* overview binary file format for pos kb: |
| |
| pos-kb = header posids |
| header = {COUNT2 OFFS2}=8 |
| posids = {POSID1 {PARTID1}0:8}1: |
| |
| where POSID1 is the value of the (combined) part-of-speech symbol, |
| and {PARTID1} are the symbol values of its components (empty if it |
| is not a combined symbol). The {PARTID1} list is sorted. |
| Part-of-speech symbols with equal number of components are grouped |
| together. |
| |
| The header contains information about these groups: |
| |
| COUNT2 specifies the number of elements in the group, and OFFS2 |
| specifies the offset (relative to the beginning of the kb) where |
| the group data starts, i.e.: |
| |
| 25 32 -> 25 not-combined elements, starting at offset 32 |
| 44 57 -> 44 elements composed of 2 symbols, starting at offset 57 |
| 23 189 -> 23 elements composed of 3 symbols, starting at offset 189 |
| ... |
| |
| Currently, each symbol may be composed of up to 8 other symbols. |
| Therefore, the header has 8 entries, too. The header starts with |
| the unique POS list, and then in increasing order, 2 symbols, 3 |
| symbols,... |
| |
| For illustration, a printf-ed dump of such a kb: |
| |
| 25 32 |
| 44 57 |
| 23 189 |
| 12 281 |
| 4 341 |
| 1 365 |
| 0 0 |
| 0 0 |
| 33 | |
| 34 | |
| 35 | |
| 60 | |
| etc. |
| 36 | 35 60 |
| 50 | 35 95 |
| 51 | 35 97 |
| 58 | 35 120 |
| 59 | 35 131 |
| 61 | 60 75 |
| 63 | 60 95 |
| 64 | 60 97 |
| etc. |
| 42 | 35 60 117 |
| 44 | 35 60 131 |
| 45 | 35 73 97 |
| 48 | 35 84 97 |
| 54 | 35 97 131 |
| 56 | 35 113 120 |
| 57 | 35 117 120 |
| 62 | 60 84 122 |
| etc. |
| */ |
| |
/* Sub object of a pos knowledge base; filled by ktabPosInitialize().
 * Index i describes the group of POS symbols made of i+1 components. */
| typedef struct ktabpos_subobj *ktabpos_SubObj; |
| |
| typedef struct ktabpos_subobj { |
/* number of entries in each group */
| picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB]; |
/* start of each group inside the kb data, NULL for an empty group */
| picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB]; |
| } ktabpos_subobj_t; |
| |
| |
| static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this, |
| picoos_Common common) { |
| ktabpos_subobj_t *ktabpos; |
| picoos_uint16 osprev; |
| picoos_uint16 os, pos; |
| picoos_uint8 i; |
| |
| PICODBG_DEBUG(("start")); |
| |
| if (NULL == this || NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| ktabpos = (ktabpos_subobj_t *)this->subObj; |
| |
| os = 0; |
| for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) { |
| ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 | |
| this->base[pos]; |
| if (ktabpos->nrcomb[i] > 0) { |
| osprev = os; |
| os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2]; |
| ktabpos->nrcombstart[i] = &(this->base[os]); |
| PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos, |
| ktabpos->nrcomb[i], osprev, os)); |
| if (osprev >= os) { |
| /* cannot be, in a valid kb */ |
| return picoos_emRaiseException(common->em, |
| PICO_EXC_FILE_CORRUPT, |
| NULL, NULL); |
| } |
| } else { |
| if (i == 0) { |
| /* cannot be, in a valid kb */ |
| return picoos_emRaiseException(common->em, |
| PICO_EXC_FILE_CORRUPT, |
| NULL, NULL); |
| } |
| ktabpos->nrcombstart[i] = NULL; |
| } |
| } |
| return PICO_OK; |
| } |
| |
| static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this, |
| picoos_MemoryManager mm) { |
| if (NULL != this) { |
| picoos_deallocate(mm, (void *) &this->subObj); |
| } |
| return PICO_OK; |
| } |
| |
| pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this, |
| picoos_Common common) { |
| if (NULL == this) { |
| return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, |
| NULL, NULL); |
| } |
| this->subDeallocate = ktabPosSubObjDeallocate; |
| this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t)); |
| if (NULL == this->subObj) { |
| return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, |
| NULL, NULL); |
| } |
| return ktabPosInitialize(this, common); |
| } |
| |
| picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) { |
| if (NULL == this) { |
| return NULL; |
| } else { |
| return (picoktab_Pos) this->subObj; |
| } |
| } |
| |
| |
| /* Pos methods */ |
| |
| static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1, |
| const picoos_uint8 *grp2, |
| picoos_uint8 len) |
| { |
| /* if both, grp1 and grp2 would be sorted in ascending order |
| we could implement a function picoktab_comparePosGroup in |
| a similar manner as strcmp */ |
| |
| picoos_uint16 i, j, equal; |
| |
| equal = 1; |
| |
| i = 0; |
| while (equal && (i < len)) { |
| /* search grp1[i] in grp2 */ |
| j = 0; |
| while ((j < len) && (grp1[i] != grp2[j])) { |
| j++; |
| } |
| equal = (j < len); |
| i++; |
| } |
| |
| return equal; |
| } |
| |
| |
| picoos_bool picoktab_isUniquePos(const picoktab_Pos this, |
| const picoos_uint8 pos) { |
| ktabpos_subobj_t *ktabpos; |
| picoos_uint16 i; |
| |
| /* speed-up possible with e.g. binary search */ |
| |
| ktabpos = (ktabpos_subobj_t *)this; |
| PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0])); |
| i = 0; |
| while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) { |
| PICODBG_TRACE(("compare with pos %d at position %d", |
| ktabpos->nrcombstart[0][i], pos, i)); |
| i++; |
| } |
| return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i])); |
| } |
| |
| |
| picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this, |
| const picoos_uint8 pos, |
| const picoos_uint8 posgroup) |
| { |
| ktabpos_subobj_t *ktabpos; |
| picoos_uint8 *grp; |
| picoos_uint16 i, j, n, s, grplen; |
| picoos_uint8 *e; |
| picoos_uint8 found; |
| |
| ktabpos = (ktabpos_subobj_t *) this; |
| |
| grp = NULL; |
| found = FALSE; |
| grplen = 0; |
| |
| /* currently, a linear search is required to find 'posgroup'; the |
| knowledge base should be extended to allow for a faster search */ |
| |
| /* treat case i==0, grplen==0, ie. pos == posgroup */ |
| if (pos == posgroup) { |
| found = TRUE; |
| } |
| |
| i = 1; |
| while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) { |
| n = ktabpos->nrcomb[i]; /* number of entries */ |
| e = ktabpos->nrcombstart[i]; /* ptr to first entry */ |
| s = i + 2; /* size of an entry in bytes */ |
| /* was with while starting at 0: |
| s = i > 0 ? i + 2 : 1; |
| */ |
| j = 0; |
| while ((grp == NULL) && (j < n)) { |
| if (posgroup == e[0]) { |
| grp = e + 1; |
| grplen = s - 1; |
| } |
| e += s; |
| j++; |
| } |
| i++; |
| } |
| |
| /* test if 'pos' is contained in the components of 'posgroup' */ |
| if (grp != NULL) { |
| for (i = 0; !found && (i < grplen); i++) { |
| if (pos == grp[i]) { |
| found = TRUE; |
| } |
| } |
| |
| /* just a way to test picoktab_getPosGroup */ |
| /* |
| PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup); |
| */ |
| } |
| |
| return found; |
| } |
| |
| |
| picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this, |
| const picoos_uint8 *poslist, |
| const picoos_uint8 poslistlen) |
| { |
| picoos_uint8 poscomb; |
| ktabpos_subobj_t *ktabpos; |
| picoos_uint16 i, j, n, s; |
| picoos_uint8 *e; |
| |
| ktabpos = (ktabpos_subobj_t *) this; |
| poscomb = 0; |
| |
| if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) { |
| i = poslistlen - 1; |
| if (i > 0) { |
| n = ktabpos->nrcomb[i]; /* number of entries */ |
| e = ktabpos->nrcombstart[i]; /* ptr to first entry */ |
| s = i + 2; /* size of an entry in bytes */ |
| j = 0; |
| while (!poscomb && (j < n)) { |
| if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) { |
| poscomb = *e; |
| } |
| e += s; |
| j++; |
| } |
| if (!poscomb) { |
| /* combination not found; shouldn't occur if lingware OK! */ |
| /* contingency solution: take first */ |
| PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0])); |
| poscomb = poslist[0]; |
| } |
| } else { /* not a composed POS */ |
| poscomb = poslist[0]; |
| } |
| } |
| |
| return poscomb; |
| } |
| |
| #ifdef __cplusplus |
| } |
| #endif |
| |
| |
| /* end */ |