| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef PINYINIME_INCLUDE_DICTTRIE_H__ |
| #define PINYINIME_INCLUDE_DICTTRIE_H__ |
| |
| #include <stdlib.h> |
| #include "./atomdictbase.h" |
| #include "./dictdef.h" |
| #include "./dictlist.h" |
| #include "./searchutility.h" |
| |
| namespace ime_pinyin { |
| |
| class DictTrie : AtomDictBase { |
| private: |
| typedef struct ParsingMark { |
| size_t node_offset:24; |
| size_t node_num:8; // Number of nodes with this spelling id given |
| // by spl_id. If spl_id is a Shengmu, for nodes |
| // in the first layer of DictTrie, it equals to |
| // SpellingTrie::shm2full_num(); but for those |
| // nodes which are not in the first layer, |
| // node_num < SpellingTrie::shm2full_num(). |
| // For a full spelling id, node_num = 1; |
| }; |
| |
| // Used to indicate an extended mile stone. |
| // An extended mile stone is used to mark a partial match in the dictionary |
| // trie to speed up further potential extending. |
| // For example, when the user inputs "w", a mile stone is created to mark the |
| // partial match status, so that when user inputs another char 'm', it will be |
| // faster to extend search space based on this mile stone. |
| // |
| // For partial match status of "wm", there can be more than one sub mile |
| // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so |
| // there may be more one parsing mark used to mark these partial matchings. |
| // A mile stone records the starting position in the mark list and number of |
| // marks. |
| struct MileStone { |
| uint16 mark_start; |
| uint16 mark_num; |
| }; |
| |
| DictList* dict_list_; |
| |
| const SpellingTrie *spl_trie_; |
| |
| LmaNodeLE0* root_; // Nodes for root and the first layer. |
| LmaNodeGE1* nodes_ge1_; // Nodes for other layers. |
| |
| // An quick index from spelling id to the LmaNodeLE0 node buffer, or |
| // to the root_ buffer. |
| // Index length: |
| // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used |
| // to get the end. |
| // All Shengmu ids are not indexed because they will be converted into |
| // corresponding full ids. |
| // So, given an id splid, the son is: |
| // root_[splid_le0_index_[splid - kFullSplIdStart]] |
| uint16 *splid_le0_index_; |
| |
| size_t lma_node_num_le0_; |
| size_t lma_node_num_ge1_; |
| |
| // The first part is for homophnies, and the last top_lma_num_ items are |
| // lemmas with highest scores. |
| unsigned char *lma_idx_buf_; |
| size_t lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. |
| size_t total_lma_num_; // Total number of lemmas in this dictionary. |
| size_t top_lmas_num_; // Number of lemma with highest scores. |
| |
| // Parsing mark list used to mark the detailed extended statuses. |
| ParsingMark *parsing_marks_; |
| // The position for next available mark. |
| uint16 parsing_marks_pos_; |
| |
| // Mile stone list used to mark the extended status. |
| MileStone *mile_stones_; |
| // The position for the next available mile stone. We use positions (except 0) |
| // as handles. |
| MileStoneHandle mile_stones_pos_; |
| |
| // Get the offset of sons for a node. |
| inline size_t get_son_offset(const LmaNodeGE1 *node); |
| |
| // Get the offset of homonious ids for a node. |
| inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); |
| |
| // Get the lemma id by the offset. |
| inline LemmaIdType get_lemma_id(size_t id_offset); |
| |
| void free_resource(bool free_dict_list); |
| |
| bool load_dict(FILE *fp); |
| |
| // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill |
| // them into the lpi_items buffer. |
| // This function is called by the search engine. |
| size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, |
| LmaNodeLE0 *node); |
| |
| // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill |
| // them into the lpi_items buffer. |
| // This function is called by inner functions extend_dict0(), extend_dict1() |
| // and extend_dict2(). |
| size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, |
| size_t homo_buf_off, LmaNodeGE1 *node, |
| uint16 lma_len); |
| |
| // Extend in the trie from level 0. |
| MileStoneHandle extend_dict0(MileStoneHandle from_handle, |
| const DictExtPara *dep, LmaPsbItem *lpi_items, |
| size_t lpi_max, size_t *lpi_num); |
| |
| // Extend in the trie from level 1. |
| MileStoneHandle extend_dict1(MileStoneHandle from_handle, |
| const DictExtPara *dep, LmaPsbItem *lpi_items, |
| size_t lpi_max, size_t *lpi_num); |
| |
| // Extend in the trie from level 2. |
| MileStoneHandle extend_dict2(MileStoneHandle from_handle, |
| const DictExtPara *dep, LmaPsbItem *lpi_items, |
| size_t lpi_max, size_t *lpi_num); |
| |
| // Try to extend the given spelling id buffer, and if the given id_lemma can |
| // be successfully gotten, return true; |
| // The given spelling ids are all valid full ids. |
| bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); |
| |
| #ifdef ___BUILD_MODEL___ |
| bool save_dict(FILE *fp); |
| #endif // ___BUILD_MODEL___ |
| |
| static const int kMaxMileStone = 100; |
| static const int kMaxParsingMark = 600; |
| static const MileStoneHandle kFirstValidMileStoneHandle = 1; |
| |
| friend class DictParser; |
| friend class DictBuilder; |
| |
| public: |
| |
| DictTrie(); |
| ~DictTrie(); |
| |
| #ifdef ___BUILD_MODEL___ |
| // Construct the tree from the file fn_raw. |
| // fn_validhzs provide the valid hanzi list. If fn_validhzs is |
| // NULL, only chars in GB2312 will be included. |
| bool build_dict(const char *fn_raw, const char *fn_validhzs); |
| |
| // Save the binary dictionary |
| // Actually, the SpellingTrie/DictList instance will be also saved. |
| bool save_dict(const char *filename); |
| #endif // ___BUILD_MODEL___ |
| |
| void convert_to_hanzis(char16 *str, uint16 str_len); |
| |
| void convert_to_scis_ids(char16 *str, uint16 str_len); |
| |
| // Load a binary dictionary |
| // The SpellingTrie instance/DictList will be also loaded |
| bool load_dict(const char *filename, LemmaIdType start_id, |
| LemmaIdType end_id); |
| bool load_dict_fd(int sys_fd, long start_offset, long length, |
| LemmaIdType start_id, LemmaIdType end_id); |
| bool close_dict() {return true;} |
| size_t number_of_lemmas() {return 0;} |
| |
| void reset_milestones(uint16 from_step, MileStoneHandle from_handle); |
| |
| MileStoneHandle extend_dict(MileStoneHandle from_handle, |
| const DictExtPara *dep, |
| LmaPsbItem *lpi_items, |
| size_t lpi_max, size_t *lpi_num); |
| |
| size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, |
| LmaPsbItem *lpi_items, size_t lpi_max); |
| |
| uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); |
| |
| uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, |
| uint16 splids_max, bool arg_valid); |
| |
| size_t predict(const char16 *last_hzs, uint16 hzs_len, |
| NPredictItem *npre_items, size_t npre_max, |
| size_t b4_used); |
| |
| LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len, uint16 count) {return 0;} |
| |
| LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, |
| bool selected) {return 0;} |
| |
| LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len) {return 0;} |
| |
| LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;} |
| |
| LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len) {return 0;} |
| |
| bool remove_lemma(LemmaIdType lemma_id) {return false;} |
| |
| size_t get_total_lemma_count() {return 0;} |
| void set_total_lemma_count_of_others(size_t count); |
| |
| void flush_cache() {} |
| |
| LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); |
| |
| // Fill the lemmas with highest scores to the prediction buffer. |
| // his_len is the history length to fill in the prediction buffer. |
| size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, |
| size_t npre_max, size_t b4_used); |
| }; |
| } |
| |
| #endif // PINYINIME_INCLUDE_DICTTRIE_H__ |