| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <assert.h> |
| #include "../include/splparser.h" |
| |
| namespace ime_pinyin { |
| |
| SpellingParser::SpellingParser() { |
| spl_trie_ = SpellingTrie::get_cpinstance(); |
| } |
| |
| bool SpellingParser::is_valid_to_parse(char ch) { |
| return SpellingTrie::is_valid_spl_char(ch); |
| } |
| |
| uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, |
| uint16 spl_idx[], uint16 start_pos[], |
| uint16 max_size, bool &last_is_pre) { |
| if (NULL == splstr || 0 == max_size || 0 == str_len) |
| return 0; |
| |
| if (!SpellingTrie::is_valid_spl_char(splstr[0])) |
| return 0; |
| |
| last_is_pre = false; |
| |
| const SpellingNode *node_this = spl_trie_->root_; |
| |
| uint16 str_pos = 0; |
| uint16 idx_num = 0; |
| if (NULL != start_pos) |
| start_pos[0] = 0; |
| bool last_is_splitter = false; |
| |
| while (str_pos < str_len) { |
| char char_this = splstr[str_pos]; |
| // all characters outside of [a, z] are considered as splitters |
| if (!SpellingTrie::is_valid_spl_char(char_this)) { |
| // test if the current node is endable |
| uint16 id_this = node_this->spelling_idx; |
| if (spl_trie_->if_valid_id_update(&id_this)) { |
| spl_idx[idx_num] = id_this; |
| |
| idx_num++; |
| str_pos++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| if (idx_num >= max_size) |
| return idx_num; |
| |
| node_this = spl_trie_->root_; |
| last_is_splitter = true; |
| continue; |
| } else { |
| if (last_is_splitter) { |
| str_pos++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| continue; |
| } else { |
| return idx_num; |
| } |
| } |
| } |
| |
| last_is_splitter = false; |
| |
| SpellingNode *found_son = NULL; |
| |
| if (0 == str_pos) { |
| if (char_this >= 'a') |
| found_son = spl_trie_->level1_sons_[char_this - 'a']; |
| else |
| found_son = spl_trie_->level1_sons_[char_this - 'A']; |
| } else { |
| SpellingNode *first_son = node_this->first_son; |
| // Because for Zh/Ch/Sh nodes, they are the last in the buffer and |
| // frequently used, so we scan from the end. |
| for (int i = 0; i < node_this->num_of_son; i++) { |
| SpellingNode *this_son = first_son + i; |
| if (SpellingTrie::is_same_spl_char( |
| this_son->char_this_node, char_this)) { |
| found_son = this_son; |
| break; |
| } |
| } |
| } |
| |
| // found, just move the current node pointer to the the son |
| if (NULL != found_son) { |
| node_this = found_son; |
| } else { |
| // not found, test if it is endable |
| uint16 id_this = node_this->spelling_idx; |
| if (spl_trie_->if_valid_id_update(&id_this)) { |
| // endable, remember the index |
| spl_idx[idx_num] = id_this; |
| |
| idx_num++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| if (idx_num >= max_size) |
| return idx_num; |
| node_this = spl_trie_->root_; |
| continue; |
| } else { |
| return idx_num; |
| } |
| } |
| |
| str_pos++; |
| } |
| |
| uint16 id_this = node_this->spelling_idx; |
| if (spl_trie_->if_valid_id_update(&id_this)) { |
| // endable, remember the index |
| spl_idx[idx_num] = id_this; |
| |
| idx_num++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| } |
| |
| last_is_pre = !last_is_splitter; |
| |
| return idx_num; |
| } |
| |
| uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, |
| uint16 spl_idx[], uint16 start_pos[], |
| uint16 max_size, bool &last_is_pre) { |
| uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, |
| max_size, last_is_pre); |
| for (uint16 pos = 0; pos < idx_num; pos++) { |
| if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { |
| spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); |
| if (pos == idx_num - 1) { |
| last_is_pre = false; |
| } |
| } |
| } |
| return idx_num; |
| } |
| |
| uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, |
| uint16 spl_idx[], uint16 start_pos[], |
| uint16 max_size, bool &last_is_pre) { |
| if (NULL == splstr || 0 == max_size || 0 == str_len) |
| return 0; |
| |
| if (!SpellingTrie::is_valid_spl_char(splstr[0])) |
| return 0; |
| |
| last_is_pre = false; |
| |
| const SpellingNode *node_this = spl_trie_->root_; |
| |
| uint16 str_pos = 0; |
| uint16 idx_num = 0; |
| if (NULL != start_pos) |
| start_pos[0] = 0; |
| bool last_is_splitter = false; |
| |
| while (str_pos < str_len) { |
| char16 char_this = splstr[str_pos]; |
| // all characters outside of [a, z] are considered as splitters |
| if (!SpellingTrie::is_valid_spl_char(char_this)) { |
| // test if the current node is endable |
| uint16 id_this = node_this->spelling_idx; |
| if (spl_trie_->if_valid_id_update(&id_this)) { |
| spl_idx[idx_num] = id_this; |
| |
| idx_num++; |
| str_pos++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| if (idx_num >= max_size) |
| return idx_num; |
| |
| node_this = spl_trie_->root_; |
| last_is_splitter = true; |
| continue; |
| } else { |
| if (last_is_splitter) { |
| str_pos++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| continue; |
| } else { |
| return idx_num; |
| } |
| } |
| } |
| |
| last_is_splitter = false; |
| |
| SpellingNode *found_son = NULL; |
| |
| if (0 == str_pos) { |
| if (char_this >= 'a') |
| found_son = spl_trie_->level1_sons_[char_this - 'a']; |
| else |
| found_son = spl_trie_->level1_sons_[char_this - 'A']; |
| } else { |
| SpellingNode *first_son = node_this->first_son; |
| // Because for Zh/Ch/Sh nodes, they are the last in the buffer and |
| // frequently used, so we scan from the end. |
| for (int i = 0; i < node_this->num_of_son; i++) { |
| SpellingNode *this_son = first_son + i; |
| if (SpellingTrie::is_same_spl_char( |
| this_son->char_this_node, char_this)) { |
| found_son = this_son; |
| break; |
| } |
| } |
| } |
| |
| // found, just move the current node pointer to the the son |
| if (NULL != found_son) { |
| node_this = found_son; |
| } else { |
| // not found, test if it is endable |
| uint16 id_this = node_this->spelling_idx; |
| if (spl_trie_->if_valid_id_update(&id_this)) { |
| // endable, remember the index |
| spl_idx[idx_num] = id_this; |
| |
| idx_num++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| if (idx_num >= max_size) |
| return idx_num; |
| node_this = spl_trie_->root_; |
| continue; |
| } else { |
| return idx_num; |
| } |
| } |
| |
| str_pos++; |
| } |
| |
| uint16 id_this = node_this->spelling_idx; |
| if (spl_trie_->if_valid_id_update(&id_this)) { |
| // endable, remember the index |
| spl_idx[idx_num] = id_this; |
| |
| idx_num++; |
| if (NULL != start_pos) |
| start_pos[idx_num] = str_pos; |
| } |
| |
| last_is_pre = !last_is_splitter; |
| |
| return idx_num; |
| } |
| |
| uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, |
| uint16 spl_idx[], uint16 start_pos[], |
| uint16 max_size, bool &last_is_pre) { |
| uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, |
| max_size, last_is_pre); |
| for (uint16 pos = 0; pos < idx_num; pos++) { |
| if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { |
| spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); |
| if (pos == idx_num - 1) { |
| last_is_pre = false; |
| } |
| } |
| } |
| return idx_num; |
| } |
| |
| uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, |
| bool *is_pre) { |
| if (NULL == is_pre) |
| return 0; |
| |
| uint16 spl_idx[2]; |
| uint16 start_pos[3]; |
| |
| if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) |
| return 0; |
| |
| if (start_pos[1] != str_len) |
| return 0; |
| return spl_idx[0]; |
| } |
| |
| uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, |
| bool *is_pre) { |
| if (NULL == is_pre) |
| return 0; |
| |
| uint16 spl_idx[2]; |
| uint16 start_pos[3]; |
| |
| if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) |
| return 0; |
| |
| if (start_pos[1] != str_len) |
| return 0; |
| if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { |
| spl_trie_->half_to_full(spl_idx[0], spl_idx); |
| *is_pre = false; |
| } |
| |
| return spl_idx[0]; |
| } |
| |
| uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, |
| uint16 splidx[], uint16 max_size, |
| uint16 &full_id_num, bool &is_pre) { |
| if (max_size <= 0 || !is_valid_to_parse(splstr[0])) |
| return 0; |
| |
| splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); |
| full_id_num = 0; |
| if (0 != splidx[0]) { |
| if (splidx[0] >= kFullSplIdStart) |
| full_id_num = 1; |
| return 1; |
| } |
| return 0; |
| } |
| |
| } // namespace ime_pinyin |