| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef PINYINIME_INCLUDE_DICTDEF_H__ |
| #define PINYINIME_INCLUDE_DICTDEF_H__ |
| |
| #include <stdlib.h> |
| #include "./utf16char.h" |
| |
| namespace ime_pinyin { |
| |
| // Enable the following line when building the binary dictionary model. |
| // #define ___BUILD_MODEL___ |
| /* |
| * Short aliases for the built-in integer types. The digits in each name |
| * presumably give the intended bit width; the real width is whatever the |
| * underlying built-in type has on the target platform. |
| */ |
| typedef unsigned char uint8; |
| typedef unsigned short uint16; |
| typedef unsigned int uint32; |
| |
| typedef signed char int8; |
| typedef short int16; |
| typedef int int32; |
| typedef long long int64; |
| typedef unsigned long long uint64; /* unsigned, although listed with the signed aliases */ |
| /* |
| * Three compile-time flags, all false here. Judging by the names they gate |
| * debug printing at different levels -- TODO confirm at the use sites. |
| */ |
| const bool kPrintDebug0 = false; |
| const bool kPrintDebug1 = false; |
| const bool kPrintDebug2 = false; |
| |
| // The max length of a lemma, counted in char16 units (see LemmaEntry::hanzi_str). |
| const size_t kMaxLemmaSize = 8; |
| |
| // The max length of a Pinyin (spelling), counted in chars (see LemmaEntry::pinyin_str). |
| const size_t kMaxPinyinSize = 6; |
| |
| // The number of half spelling ids. For Chinese Pinyin, there are 30 half ids. |
| // See SpellingTrie.h for details. |
| const size_t kHalfSpellingIdNum = 29; /* NOTE(review): the comment says 30 but the |
| * value is 29 -- presumably one id is not counted here; confirm in SpellingTrie.h. */ |
| |
| // The maximum number of full spellings. For Chinese Pinyin, there are only |
| // about 410 spellings. |
| // If this value is made bigger (so that it needs more bits), please also update |
| // other structures like SpellingNode, to make sure that a spelling id can |
| // still be stored. |
| // The extra -1 is because id 0 is never used. With the values here: 512 - 29 - 1 = 482. |
| const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; |
| const size_t kMaxSearchSteps = 40; /* Cap on the number of search steps; what one step |
| * is cannot be seen in this header -- check the users of this constant. */ |
| |
| // One character predicts the characters that follow it, hence the max lemma size minus 1. |
| const size_t kMaxPredictSize = (kMaxLemmaSize - 1); |
| |
| // LemmaIdType must always be size_t. |
| typedef size_t LemmaIdType; |
| const size_t kLemmaIdSize = 3; // An id actually occupies only 3 bytes in storage. |
| const size_t kLemmaIdComposing = 0xffffff; /* All 24 bits set, i.e. the largest value a |
| * 3-byte id can hold; presumably reserved for the phrase being composed -- TODO confirm. */ |
| |
| typedef uint16 LmaScoreType; /* presumably the score of a lemma ("Lma") -- confirm at use sites */ |
| typedef uint16 KeyScoreType; /* presumably a score attached to a key -- confirm at use sites */ |
| |
| // The number of highest-scoring items kept for prediction purposes. |
| const size_t kTopScoreLemmaNum = 10; |
| /* |
| * Per-case prediction limits. Judging by the names they cap how many |
| * predictions are produced from a history of more than 3, exactly 3 and |
| * exactly 2 characters respectively -- TODO confirm at the use sites. |
| */ |
| const size_t kMaxPredictNumByGt3 = 1; |
| const size_t kMaxPredictNumBy3 = 2; |
| const size_t kMaxPredictNumBy2 = 2; |
| |
| // The last lemma id (included) for the system dictionary. The system |
| // dictionary's ids always start from 1. |
| const LemmaIdType kSysDictIdEnd = 500000; |
| |
| // The first lemma id for the user dictionary; it directly follows kSysDictIdEnd. |
| const LemmaIdType kUserDictIdStart = 500001; |
| |
| // The last lemma id (included) for the user dictionary, which leaves room for 100000 ids. |
| const LemmaIdType kUserDictIdEnd = 600000; |
| /* |
| * A spelling id made of two bit-fields that together take 16 bits. |
| * PSpellingId is the matching pointer type. |
| */ |
| typedef struct { |
| uint16 half_splid:5; /* 5 bits, values 0..31; kHalfSpellingIdNum (29) fits */ |
| uint16 full_splid:11; /* 11 bits, values 0..2047; kMaxSpellingNum fits */ |
| } SpellingId, *PSpellingId; |
| |
| |
| /** |
| * We use different node types for different layers. |
| * Statistical data of the building result for a testing dictionary: |
| *                             root  level 0  level 1  level 2  level 3 |
| * max son num of one node:     406      280       41        2        - |
| * max homo num of one node:      0       90       23        2        2 |
| * total node num of a layer:     1      406    31766    13516      993 |
| * total homo num of a layer:     9     5674    44609    12667      995 |
| * |
| * The node number for root and level 0 won't be larger than 500. |
| * According to the information above, two kinds of nodes can be used; one for |
| * root and level 0, the other for the layers deeper than 0. |
| * |
| * LE = less than or equal (this node type is for root and level 0). |
| * A node occupies 16 bytes when size_t is 4 bytes wide (4 + 4 + 3 * 2 = 14, |
| * presumably rounded up to 16 by alignment), so the total is less than |
| * 16 * 500 = 8K. |
| * NOTE(review): with an 8-byte size_t the two offset members alone take |
| * 16 bytes, so a node is larger than stated above. |
| */ |
| struct LmaNodeLE0 { |
| size_t son_1st_off; /* offset of the first son node; what it is relative to is not shown here */ |
| size_t homo_idx_buf_off; /* offset into the homo index buffer, judging by the name */ |
| uint16 spl_idx; /* spelling index of this node, judging by the name */ |
| uint16 num_of_son; /* number of son nodes */ |
| uint16 num_of_homo; /* number of homo words */ |
| }; |
| |
| /** |
| * GE = greater than or equal (this node type is for the layers deeper than 0). |
| * The members add up to 10 bytes: three uint16 plus four unsigned char, |
| * assuming a 2-byte uint16. |
| * NOTE(review): this comment used to say 8 bytes, which is the size without |
| * the two high-bits members son_1st_off_h and homo_idx_buf_off_h. |
| * Each offset is split into a uint16 low part and an unsigned char high part. |
| */ |
| struct LmaNodeGE1 { |
| uint16 son_1st_off_l; // Low bits of the son_1st_off |
| uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off |
| uint16 spl_idx; /* spelling index of this node, judging by the name */ |
| unsigned char num_of_son; // number of son nodes |
| unsigned char num_of_homo; // number of homo words |
| unsigned char son_1st_off_h; // high bits of the son_1st_off |
| unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off |
| }; |
| /* |
| * The structures from here to the matching #endif exist only when |
| * ___BUILD_MODEL___ is defined; the #define for it near the top of this |
| * header is commented out. |
| */ |
| #ifdef ___BUILD_MODEL___ |
| struct SingleCharItem { /* one Hanzi together with its spelling id and frequency */ |
| float freq; /* frequency value; unit and scale are not shown in this header */ |
| char16 hz; /* the Hanzi character, one char16 unit */ |
| SpellingId splid; /* half/full spelling id pair for hz */ |
| }; |
| /** |
| * One lemma as used while building the dictionary model. The arrays are |
| * sized by kMaxLemmaSize, i.e. one slot per character of the longest |
| * allowed lemma. |
| */ |
| struct LemmaEntry { |
| LemmaIdType idx_by_py; /* presumably the index when ordered by Pinyin -- TODO confirm */ |
| LemmaIdType idx_by_hz; /* presumably the index when ordered by Hanzi -- TODO confirm */ |
| char16 hanzi_str[kMaxLemmaSize + 1]; /* the Hanzi string; +1 presumably for a terminator */ |
| |
| // The SingleCharItem id for each Hanzi. |
| uint16 hanzi_scis_ids[kMaxLemmaSize]; |
| |
| uint16 spl_idx_arr[kMaxLemmaSize + 1]; /* spelling index per Hanzi, judging by the name */ |
| char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; /* one Pinyin string per Hanzi */ |
| unsigned char hz_str_len; /* presumably the number of Hanzi in hanzi_str -- TODO confirm */ |
| float freq; /* frequency value; unit and scale are not shown in this header */ |
| }; |
| #endif // ___BUILD_MODEL___ |
| |
| } // namespace ime_pinyin |
| |
| #endif // PINYINIME_INCLUDE_DICTDEF_H__ |