| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ |
| #define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ |
| |
| #include <stdlib.h> |
| #include "./atomdictbase.h" |
| #include "./dicttrie.h" |
| #include "./searchutility.h" |
| #include "./spellingtrie.h" |
| #include "./splparser.h" |
| |
| namespace ime_pinyin { |
| |
| static const size_t kMaxRowNum = kMaxSearchSteps; |
| |
| typedef struct { |
| // MileStoneHandle objects for the system and user dictionaries. |
| MileStoneHandle dict_handles[2]; |
| // From which DMI node. -1 means it's from root. |
| PoolPosType dmi_fr; |
| // The spelling id for the Pinyin string from the previous DMI to this node. |
| // If it is a half id like Shengmu, the node pointed by dict_node is the first |
| // node with this Shengmu, |
| uint16 spl_id; |
| // What's the level of the dict node. Level of root is 0, but root is never |
| // recorded by dict_node. |
| unsigned char dict_level:7; |
| // If this node is for composing phrase, this bit is 1. |
| unsigned char c_phrase:1; |
| // Whether the spl_id is parsed with a split character at the end. |
| unsigned char splid_end_split:1; |
| // What's the length of the spelling string for this match, for the whole |
| // word. |
| unsigned char splstr_len:7; |
| // Used to indicate whether all spelling ids from the root are full spelling |
| // ids. This information is useful for keymapping mode(not finished). Because |
| // in this mode, there is no clear boundaries, we prefer those results which |
| // have full spelling ids. |
| unsigned char all_full_id:1; |
| } DictMatchInfo, *PDictMatchInfo; |
| |
| typedef struct MatrixNode { |
| LemmaIdType id; |
| float score; |
| MatrixNode *from; |
| // From which DMI node. Used to trace the spelling segmentation. |
| PoolPosType dmi_fr; |
| uint16 step; |
| } MatrixNode, *PMatrixNode; |
| |
| typedef struct { |
| // The MatrixNode position in the matrix pool |
| PoolPosType mtrx_nd_pos; |
| // The DictMatchInfo position in the DictMatchInfo pool. |
| PoolPosType dmi_pos; |
| uint16 mtrx_nd_num; |
| uint16 dmi_num:15; |
| // Used to indicate whether there are dmi nodes in this step with full |
| // spelling id. This information is used to decide whether a substring of a |
| // valid Pinyin should be extended. |
| // |
| // Example1: shoudao |
| // When the last char 'o' is added, the parser will find "dao" is a valid |
| // Pinyin, and because all dmi nodes at location 'd' (including those for |
| // "shoud", and those for "d") have Shengmu id only, so it is not necessary |
| // to extend "ao", otherwise the result may be "shoud ao", that is not |
| // reasonable. |
| // |
| // Example2: hengao |
| // When the last 'o' is added, the parser finds "gao" is a valid Pinyin. |
| // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi |
| // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus |
| // "heng ao" can also be the result. |
| // |
| // Similarly, "ganga" is expanded to "gang a". |
| // |
| // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi |
| // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it |
| // is not valid either). If the parser uses break in the loop, the result |
| // always be "xian"; but if the parser uses continue in the loop, "xi an" will |
| // also be tried. This behaviour can be set via the function |
| // set_xi_an_switch(). |
| uint16 dmi_has_full_id:1; |
| // Points to a MatrixNode of the current step to indicate which choice the |
| // user selects. |
| MatrixNode *mtrx_nd_fixed; |
| } MatrixRow, *PMatrixRow; |
| |
| // When user inputs and selects candidates, the fixed lemma ids are stored in |
| // lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many |
| // lemmas from the beginning are fixed. If user deletes Pinyin characters one |
| // by one from the end, these fixed lemmas can be unlocked one by one when |
| // necessary. Whenever user deletes a Chinese character and its spelling string |
| // in these fixed lemmas, all fixed lemmas will be merged together into a unit |
| // named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing |
| // phrase will be the first lemma in the sentence. Because it contains some |
| // modified lemmas (by deleting a character), these merged lemmas are called |
| // sub lemmas (sublma), and each of them are represented individually, so that |
| // when user deletes Pinyin characters from the end, these sub lemmas can also |
| // be unlocked one by one. |
| typedef struct { |
| uint16 spl_ids[kMaxRowNum]; |
| uint16 spl_start[kMaxRowNum]; |
| char16 chn_str[kMaxRowNum]; // Chinese string. |
| uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters. |
| size_t sublma_num; |
| uint16 length; // Counted in Chinese characters. |
| } ComposingPhrase, *TComposingPhrase; |
| |
| class MatrixSearch { |
| private: |
| // If it is true, prediction list by string whose length is greater than 1 |
| // will be limited to a reasonable number. |
| static const bool kPredictLimitGt1 = false; |
| |
| // If it is true, the engine will prefer long history based prediction, |
| // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are |
| // based on the two-character history. |
| static const bool kPreferLongHistoryPredict = true; |
| |
| // If it is true, prediction will only be based on user dictionary. this flag |
| // is for debug purpose. |
| static const bool kOnlyUserDictPredict = false; |
| |
| // The maximum buffer to store LmaPsbItems. |
| static const size_t kMaxLmaPsbItems = 1450; |
| |
| // How many rows for each step. |
| static const size_t kMaxNodeARow = 5; |
| |
| // The maximum length of the sentence candidates counted in chinese |
| // characters |
| static const size_t kMaxSentenceLength = 16; |
| |
| // The size of the matrix node pool. |
| static const size_t kMtrxNdPoolSize = 200; |
| |
| // The size of the DMI node pool. |
| static const size_t kDmiPoolSize = 800; |
| |
| // Used to indicate whether this object has been initialized. |
| bool inited_; |
| |
| // Spelling trie. |
| const SpellingTrie *spl_trie_; |
| |
| // Used to indicate this switcher status: when "xian" is parseed, should |
| // "xi an" also be extended. Default is false. |
| // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string |
| // should be valid for a FULL spelling, or a combination of two spellings, |
| // first of which is a FULL id too. So even it is true, "da" will never be |
| // split into "d a", because "d" is not a full spelling id. |
| bool xi_an_enabled_; |
| |
| // System dictionary. |
| DictTrie* dict_trie_; |
| |
| // User dictionary. |
| AtomDictBase* user_dict_; |
| |
| // Spelling parser. |
| SpellingParser* spl_parser_; |
| |
| // The maximum allowed length of spelling string (such as a Pinyin string). |
| size_t max_sps_len_; |
| |
| // The maximum allowed length of a result Chinese string. |
| size_t max_hzs_len_; |
| |
| // Pinyin string. Max length: kMaxRowNum - 1 |
| char pys_[kMaxRowNum]; |
| |
| // The length of the string that has been decoded successfully. |
| size_t pys_decoded_len_; |
| |
| // Shared buffer for multiple purposes. |
| size_t *share_buf_; |
| |
| MatrixNode *mtrx_nd_pool_; |
| PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool |
| DictMatchInfo *dmi_pool_; |
| PoolPosType dmi_pool_used_; // How many items used in the pool |
| |
| MatrixRow *matrix_; // The first row is for starting |
| |
| DictExtPara *dep_; // Parameter used to extend DMI nodes. |
| |
| NPredictItem *npre_items_; // Used to do prediction |
| size_t npre_items_len_; |
| |
| // The starting positions and lemma ids for the full sentence candidate. |
| size_t lma_id_num_; |
| uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids. |
| LemmaIdType lma_id_[kMaxRowNum]; |
| size_t fixed_lmas_; |
| |
| // If fixed_lmas_ is bigger than i, Element i is used to indicate whether |
| // the i'th lemma id in lma_id_ is the first candidate for that step. |
| // If all candidates are the first one for that step, the whole string can be |
| // decoded by the engine automatically, so no need to add it to user |
| // dictionary. (We are considering to add it to user dictionary in the |
| // future). |
| uint8 fixed_lmas_no1_[kMaxRowNum]; |
| |
| // Composing phrase |
| ComposingPhrase c_phrase_; |
| |
| // If dmi_c_phrase_ is true, the decoder will try to match the |
| // composing phrase (And definitely it will match successfully). If it |
| // is false, the decoder will try to match lemmas items in dictionaries. |
| bool dmi_c_phrase_; |
| |
| // The starting positions and spelling ids for the first full sentence |
| // candidate. |
| size_t spl_id_num_; // Number of splling ids |
| uint16 spl_start_[kMaxRowNum]; // Starting positions |
| uint16 spl_id_[kMaxRowNum]; // Spelling ids |
| // Used to remember the last fixed position, counted in Hanzi. |
| size_t fixed_hzs_; |
| |
| // Lemma Items with possibility score, two purposes: |
| // 1. In Viterbi decoding, this buffer is used to get all possible candidates |
| // for current step; |
| // 2. When the search is done, this buffer is used to get candiates from the |
| // first un-fixed step and show them to the user. |
| LmaPsbItem lpi_items_[kMaxLmaPsbItems]; |
| size_t lpi_total_; |
| |
| // Assign the pointers with NULL. The caller makes sure that all pointers are |
| // not valid before calling it. This function only will be called in the |
| // construction function and free_resource(). |
| void reset_pointers_to_null(); |
| |
| bool alloc_resource(); |
| |
| void free_resource(); |
| |
| // Reset the search space totally. |
| bool reset_search0(); |
| |
| // Reset the search space from ch_pos step. For example, if the original |
| // input Pinyin is "an", reset_search(1) will reset the search space to the |
| // result of "a". If the given position is out of range, return false. |
| // if clear_fixed_this_step is true, and the ch_pos step is a fixed step, |
| // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes. |
| // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step. |
| // The DMI nodes will be kept. |
| // |
| // Note: this function should not destroy content of pys_. |
| bool reset_search(size_t ch_pos, bool clear_fixed_this_step, |
| bool clear_dmi_this_step, bool clear_mtrx_this_step); |
| |
| // Delete a part of the content in pys_. |
| void del_in_pys(size_t start, size_t len); |
| |
| // Delete a spelling id and its corresponding Chinese character, and merge |
| // the fixed lemmas into the composing phrase. |
| // del_spl_pos indicates which spelling id needs to be delete. |
| // This function will update the lemma and spelling segmentation information. |
| // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within |
| // the fixed lemmas. |
| void merge_fixed_lmas(size_t del_spl_pos); |
| |
| // Get spelling start posistions and ids. The result will be stored in |
| // spl_id_num_, spl_start_[], spl_id_[]. |
| // fixed_hzs_ will be also assigned. |
| void get_spl_start_id(); |
| |
| // Get all lemma ids with match the given spelling id stream(shorter than the |
| // maximum length of a word). |
| // If pfullsent is not NULL, means the full sentence candidate may be the |
| // same with the coming lemma string, if so, remove that lemma. |
| // The result is sorted in descendant order by the frequency score. |
| size_t get_lpis(const uint16* splid_str, size_t splid_str_len, |
| LmaPsbItem* lma_buf, size_t max_lma_buf, |
| const char16 *pfullsent, bool sort_by_psb); |
| |
| uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); |
| |
| uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, |
| uint16 splids_max, bool arg_valid); |
| |
| |
| // Extend a DMI node with a spelling id. ext_len is the length of the rows |
| // to extend, actually, it is the size of the spelling string of splid. |
| // return value can be 1 or 0. |
| // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in |
| // the pool). |
| // 0 means either the dmi node can not be extended with splid, or the splid |
| // is a Shengmu id, which is only used to get lpi_items, or the result node |
| // in DictTrie has no son, it is not nccessary to keep the new DMI. |
| // |
| // This function modifies the content of lpi_items_ and lpi_total_. |
| // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size. |
| // The function's returned value has no relation with the value of lpi_num. |
| // |
| // If dmi == NULL, this function will extend the root node of DictTrie |
| // |
| // This function will not change dmi_nd_pool_used_. Please change it after |
| // calling this function if necessary. |
| // |
| // The caller should guarantees that NULL != dep. |
| size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s); |
| |
| // Extend dmi for the composing phrase. |
| size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s); |
| |
| // Extend a MatrixNode with the give LmaPsbItem list. |
| // res_row is the destination row number. |
| // This function does not change mtrx_nd_pool_used_. Please change it after |
| // calling this function if necessary. |
| // return 0 always. |
| size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], |
| size_t lpi_num, PoolPosType dmi_fr, size_t res_row); |
| |
| |
| // Try to find a dmi node at step_to position, and the found dmi node should |
| // match the given spelling id strings. |
| PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num); |
| |
| bool add_char(char ch); |
| bool prepare_add_char(char ch); |
| |
| // Called after prepare_add_char, so the input char has been saved. |
| bool add_char_qwerty(); |
| |
| // Prepare candidates from the last fixed hanzi position. |
| void prepare_candidates(); |
| |
| // Is the character in step pos a splitter character? |
| // The caller guarantees that the position is valid. |
| bool is_split_at(uint16 pos); |
| |
| void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, |
| PoolPosType dmi_fr, |
| uint16 spl_id, uint16 node_num, unsigned char dict_level, |
| bool splid_end_split, unsigned char splstr_len, |
| unsigned char all_full_id); |
| |
| size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num, |
| char16 predict_buf[][kMaxPredictSize + 1], |
| size_t buf_len); |
| |
| // Add the first candidate to the user dictionary. |
| bool try_add_cand0_to_userdict(); |
| |
| // Add a user lemma to the user dictionary. This lemma is a subset of |
| // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the |
| // number of lemmas to be combined together as a new lemma. The caller |
| // gurantees that the combined new lemma's length is less or equal to |
| // kMaxLemmaSize. |
| bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score); |
| |
| // Update dictionary frequencies. |
| void update_dict_freq(); |
| |
| void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level); |
| |
| public: |
| MatrixSearch(); |
| ~MatrixSearch(); |
| |
| bool init(const char *fn_sys_dict, const char *fn_usr_dict); |
| |
| bool init_fd(int sys_fd, long start_offset, long length, |
| const char *fn_usr_dict); |
| |
| void set_max_lens(size_t max_sps_len, size_t max_hzs_len); |
| |
| void close(); |
| |
| void flush_cache(); |
| |
| void set_xi_an_switch(bool xi_an_enabled); |
| |
| bool get_xi_an_switch(); |
| |
| // Reset the search space. Equivalent to reset_search(0). |
| // If inited, always return true; |
| bool reset_search(); |
| |
| // Search a Pinyin string. |
| // Return value is the position successfully parsed. |
| size_t search(const char *py, size_t py_len); |
| |
| // Used to delete something in the Pinyin string kept by the engine, and do |
| // a re-search. |
| // Return value is the new length of Pinyin string kept by the engine which |
| // is parsed successfully. |
| // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin |
| // character needs to be deleted. If is_pos_in_splid is true, all Pinyin |
| // characters for pos-th spelling id needs to be deleted. |
| // If the deleted character(s) is just after a fixed lemma or sub lemma in |
| // composing phrase, clear_fixed_this_step indicates whether we needs to |
| // unlock the last fixed lemma or sub lemma. |
| // If is_pos_in_splid is false, and pos-th character is in the range for the |
| // fixed lemmas or composing string, this function will do nothing and just |
| // return the result of the previous search. |
| size_t delsearch(size_t pos, bool is_pos_in_splid, |
| bool clear_fixed_this_step); |
| |
| // Get the number of candiates, called after search(). |
| size_t get_candidate_num(); |
| |
| // Get the Pinyin string stored by the engine. |
| // *decoded_len returns the length of the successfully decoded string. |
| const char* get_pystr(size_t *decoded_len); |
| |
| // Get the spelling boundaries for the first sentence candidate. |
| // Number of spellings will be returned. The number of valid elements in |
| // spl_start is one more than the return value because the last one is used |
| // to indicate the beginning of the next un-input speling. |
| // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] . |
| size_t get_spl_start(const uint16 *&spl_start); |
| |
| // Get one candiate string. If full sentence candidate is available, it will |
| // be the first one. |
| char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len); |
| |
| // Get the first candiate, which is a "full sentence". |
| // retstr_len is not NULL, it will be used to return the string length. |
| // If only_unfixed is true, only unfixed part will be fetched. |
| char16* get_candidate0(char16* cand_str, size_t max_len, |
| uint16 *retstr_len, bool only_unfixed); |
| |
| // Choose a candidate. The decoder will do a search after the fixed position. |
| size_t choose(size_t cand_id); |
| |
| // Cancel the last choosing operation, and return the new number of choices. |
| size_t cancel_last_choice(); |
| |
| // Get the length of fixed Hanzis. |
| size_t get_fixedlen(); |
| |
| size_t get_predicts(const char16 fixed_buf[], |
| char16 predict_buf[][kMaxPredictSize + 1], |
| size_t buf_len); |
| }; |
| } |
| |
| #endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ |