| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include "../include/dictlist.h" |
| #include "../include/mystdlib.h" |
| #include "../include/ngram.h" |
| #include "../include/searchutility.h" |
| |
| namespace ime_pinyin { |
| |
| DictList::DictList() { |
| initialized_ = false; |
| scis_num_ = 0; |
| scis_hz_ = NULL; |
| scis_splid_ = NULL; |
| buf_ = NULL; |
| spl_trie_ = SpellingTrie::get_cpinstance(); |
| |
| assert(kMaxLemmaSize == 8); |
| cmp_func_[0] = cmp_hanzis_1; |
| cmp_func_[1] = cmp_hanzis_2; |
| cmp_func_[2] = cmp_hanzis_3; |
| cmp_func_[3] = cmp_hanzis_4; |
| cmp_func_[4] = cmp_hanzis_5; |
| cmp_func_[5] = cmp_hanzis_6; |
| cmp_func_[6] = cmp_hanzis_7; |
| cmp_func_[7] = cmp_hanzis_8; |
| } |
| |
| DictList::~DictList() { |
| free_resource(); |
| } |
| |
| bool DictList::alloc_resource(size_t buf_size, size_t scis_num) { |
| // Allocate memory |
| buf_ = static_cast<char16*>(malloc(buf_size * sizeof(char16))); |
| if (NULL == buf_) |
| return false; |
| |
| scis_num_ = scis_num; |
| |
| scis_hz_ = static_cast<char16*>(malloc(scis_num_ * sizeof(char16))); |
| if (NULL == scis_hz_) |
| return false; |
| |
| scis_splid_ = static_cast<SpellingId*> |
| (malloc(scis_num_ * sizeof(SpellingId))); |
| |
| if (NULL == scis_splid_) |
| return false; |
| |
| return true; |
| } |
| |
| void DictList::free_resource() { |
| if (NULL != buf_) |
| free(buf_); |
| buf_ = NULL; |
| |
| if (NULL != scis_hz_) |
| free(scis_hz_); |
| scis_hz_ = NULL; |
| |
| if (NULL != scis_splid_) |
| free(scis_splid_); |
| scis_splid_ = NULL; |
| } |
| |
| #ifdef ___BUILD_MODEL___ |
| bool DictList::init_list(const SingleCharItem *scis, size_t scis_num, |
| const LemmaEntry *lemma_arr, size_t lemma_num) { |
| if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num) |
| return false; |
| |
| initialized_ = false; |
| |
| if (NULL != buf_) |
| free(buf_); |
| |
| // calculate the size |
| size_t buf_size = calculate_size(lemma_arr, lemma_num); |
| if (0 == buf_size) |
| return false; |
| |
| if (!alloc_resource(buf_size, scis_num)) |
| return false; |
| |
| fill_scis(scis, scis_num); |
| |
| // Copy the related content from the array to inner buffer |
| fill_list(lemma_arr, lemma_num); |
| |
| initialized_ = true; |
| return true; |
| } |
| |
| size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) { |
| size_t last_hz_len = 0; |
| size_t list_size = 0; |
| size_t id_num = 0; |
| |
| for (size_t i = 0; i < lemma_num; i++) { |
| if (0 == i) { |
| last_hz_len = lemma_arr[i].hz_str_len; |
| |
| assert(last_hz_len > 0); |
| assert(lemma_arr[0].idx_by_hz == 1); |
| |
| id_num++; |
| start_pos_[0] = 0; |
| start_id_[0] = id_num; |
| |
| last_hz_len = 1; |
| list_size += last_hz_len; |
| } else { |
| size_t current_hz_len = lemma_arr[i].hz_str_len; |
| |
| assert(current_hz_len >= last_hz_len); |
| |
| if (current_hz_len == last_hz_len) { |
| list_size += current_hz_len; |
| id_num++; |
| } else { |
| for (size_t len = last_hz_len; len < current_hz_len - 1; len++) { |
| start_pos_[len] = start_pos_[len - 1]; |
| start_id_[len] = start_id_[len - 1]; |
| } |
| |
| start_pos_[current_hz_len - 1] = list_size; |
| |
| id_num++; |
| start_id_[current_hz_len - 1] = id_num; |
| |
| last_hz_len = current_hz_len; |
| list_size += current_hz_len; |
| } |
| } |
| } |
| |
| for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) { |
| if (0 == i) { |
| start_pos_[0] = 0; |
| start_id_[0] = 1; |
| } else { |
| start_pos_[i] = list_size; |
| start_id_[i] = id_num; |
| } |
| } |
| |
| return start_pos_[kMaxLemmaSize]; |
| } |
| |
| void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) { |
| assert(scis_num_ == scis_num); |
| |
| for (size_t pos = 0; pos < scis_num_; pos++) { |
| scis_hz_[pos] = scis[pos].hz; |
| scis_splid_[pos] = scis[pos].splid; |
| } |
| } |
| |
| void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) { |
| size_t current_pos = 0; |
| |
| utf16_strncpy(buf_, lemma_arr[0].hanzi_str, |
| lemma_arr[0].hz_str_len); |
| |
| current_pos = lemma_arr[0].hz_str_len; |
| |
| size_t id_num = 1; |
| |
| for (size_t i = 1; i < lemma_num; i++) { |
| utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str, |
| lemma_arr[i].hz_str_len); |
| |
| id_num++; |
| current_pos += lemma_arr[i].hz_str_len; |
| } |
| |
| assert(current_pos == start_pos_[kMaxLemmaSize]); |
| assert(id_num == start_id_[kMaxLemmaSize]); |
| } |
| |
| char16* DictList::find_pos2_startedbyhz(char16 hz_char) { |
| char16 *found_2w = static_cast<char16*> |
| (mybsearch(&hz_char, buf_ + start_pos_[1], |
| (start_pos_[2] - start_pos_[1]) / 2, |
| sizeof(char16) * 2, cmp_hanzis_1)); |
| if (NULL == found_2w) |
| return NULL; |
| |
| while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1)) |
| found_2w -= 2; |
| |
| return found_2w; |
| } |
| #endif // ___BUILD_MODEL___ |
| |
| char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[], |
| size_t word_len, int (*cmp_func)(const void *, const void *)) { |
| char16 *found_w = static_cast<char16*> |
| (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1], |
| (start_pos_[word_len] - start_pos_[word_len - 1]) |
| / word_len, |
| sizeof(char16) * word_len, cmp_func)); |
| |
| if (NULL == found_w) |
| return NULL; |
| |
| while (found_w > buf_ + start_pos_[word_len -1] && |
| cmp_func(found_w, found_w - word_len) == 0) |
| found_w -= word_len; |
| |
| return found_w; |
| } |
| |
| size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len, |
| NPredictItem *npre_items, size_t npre_max, |
| size_t b4_used) { |
| assert(hzs_len <= kMaxPredictSize && hzs_len > 0); |
| |
| // 1. Prepare work |
| int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1]; |
| |
| NGram& ngram = NGram::get_instance(); |
| |
| size_t item_num = 0; |
| |
| // 2. Do prediction |
| for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len; |
| pre_len++) { |
| uint16 word_len = hzs_len + pre_len; |
| char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func); |
| if (NULL == w_buf) |
| continue; |
| while (w_buf < buf_ + start_pos_[word_len] && |
| cmp_func(w_buf, last_hzs) == 0 && |
| item_num < npre_max) { |
| memset(npre_items + item_num, 0, sizeof(NPredictItem)); |
| utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len); |
| npre_items[item_num].psb = |
| ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1]) |
| / word_len + start_id_[word_len - 1]); |
| npre_items[item_num].his_len = hzs_len; |
| item_num++; |
| w_buf += word_len; |
| } |
| } |
| |
| size_t new_num = 0; |
| for (size_t i = 0; i < item_num; i++) { |
| // Try to find it in the existing items |
| size_t e_pos; |
| for (e_pos = 1; e_pos <= b4_used; e_pos++) { |
| if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs, |
| kMaxPredictSize) == 0) |
| break; |
| } |
| if (e_pos <= b4_used) |
| continue; |
| |
| // If not found, append it to the buffer |
| npre_items[new_num] = npre_items[i]; |
| new_num++; |
| } |
| |
| return new_num; |
| } |
| |
| uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, |
| uint16 str_max) { |
| if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf |
| || str_max <= 1) |
| return 0; |
| |
| // Find the range |
| for (uint16 i = 0; i < kMaxLemmaSize; i++) { |
| if (i + 1 > str_max - 1) |
| return 0; |
| if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) { |
| size_t id_span = id_lemma - start_id_[i]; |
| |
| uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1); |
| for (uint16 len = 0; len <= i; len++) { |
| str_buf[len] = buf[len]; |
| } |
| str_buf[i+1] = (char16)'\0'; |
| return i + 1; |
| } |
| } |
| return 0; |
| } |
| |
| uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid, |
| uint16 *splids, uint16 max_splids) { |
| char16 *hz_found = static_cast<char16*> |
| (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1)); |
| assert(NULL != hz_found && hanzi == *hz_found); |
| |
| // Move to the first one. |
| while (hz_found > scis_hz_ && hanzi == *(hz_found - 1)) |
| hz_found--; |
| |
| // First try to found if strict comparison result is not zero. |
| char16 *hz_f = hz_found; |
| bool strict = false; |
| while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) { |
| uint16 pos = hz_f - scis_hz_; |
| if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) { |
| strict = true; |
| } |
| hz_f++; |
| } |
| |
| uint16 found_num = 0; |
| while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) { |
| uint16 pos = hz_found - scis_hz_; |
| if (0 == half_splid || |
| (strict && scis_splid_[pos].half_splid == half_splid) || |
| (!strict && spl_trie_->half_full_compatible(half_splid, |
| scis_splid_[pos].full_splid))) { |
| assert(found_num + 1 < max_splids); |
| splids[found_num] = scis_splid_[pos].full_splid; |
| found_num++; |
| } |
| hz_found++; |
| } |
| |
| return found_num; |
| } |
| |
| LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) { |
| if (NULL == str || str_len > kMaxLemmaSize) |
| return 0; |
| |
| char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]); |
| if (NULL == found) |
| return 0; |
| |
| assert(found > buf_); |
| assert(static_cast<size_t>(found - buf_) >= start_pos_[str_len - 1]); |
| return static_cast<LemmaIdType> |
| (start_id_[str_len - 1] + |
| (found - buf_ - start_pos_[str_len - 1]) / str_len); |
| } |
| |
| void DictList::convert_to_hanzis(char16 *str, uint16 str_len) { |
| assert(NULL != str); |
| |
| for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { |
| str[str_pos] = scis_hz_[str[str_pos]]; |
| } |
| } |
| |
| void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) { |
| assert(NULL != str); |
| |
| for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { |
| str[str_pos] = 0x100; |
| } |
| } |
| |
| bool DictList::save_list(FILE *fp) { |
| if (!initialized_ || NULL == fp) |
| return false; |
| |
| if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] || |
| NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_) |
| return false; |
| |
| if (fwrite(&scis_num_, sizeof(size_t), 1, fp) != 1) |
| return false; |
| |
| if (fwrite(start_pos_, sizeof(size_t), kMaxLemmaSize + 1, fp) != |
| kMaxLemmaSize + 1) |
| return false; |
| |
| if (fwrite(start_id_, sizeof(size_t), kMaxLemmaSize + 1, fp) != |
| kMaxLemmaSize + 1) |
| return false; |
| |
| if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) |
| return false; |
| |
| if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) |
| return false; |
| |
| if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != |
| start_pos_[kMaxLemmaSize]) |
| return false; |
| |
| return true; |
| } |
| |
| bool DictList::load_list(FILE *fp) { |
| if (NULL == fp) |
| return false; |
| |
| initialized_ = false; |
| |
| if (fread(&scis_num_, sizeof(size_t), 1, fp) != 1) |
| return false; |
| |
| if (fread(start_pos_, sizeof(size_t), kMaxLemmaSize + 1, fp) != |
| kMaxLemmaSize + 1) |
| return false; |
| |
| if (fread(start_id_, sizeof(size_t), kMaxLemmaSize + 1, fp) != |
| kMaxLemmaSize + 1) |
| return false; |
| |
| free_resource(); |
| |
| if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_)) |
| return false; |
| |
| if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) |
| return false; |
| |
| if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) |
| return false; |
| |
| if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != |
| start_pos_[kMaxLemmaSize]) |
| return false; |
| |
| initialized_ = true; |
| return true; |
| } |
| } // namespace ime_pinyin |