| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <assert.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "../include/dictbuilder.h" |
| #include "../include/dicttrie.h" |
| #include "../include/mystdlib.h" |
| #include "../include/ngram.h" |
| #include "../include/searchutility.h" |
| #include "../include/spellingtable.h" |
| #include "../include/spellingtrie.h" |
| #include "../include/splparser.h" |
| #include "../include/utf16reader.h" |
| |
| namespace ime_pinyin { |
| |
| #ifdef ___BUILD_MODEL___ |
| |
| static const size_t kReadBufLen = 512; |
| static const size_t kSplTableHashLen = 2000; |
| |
| // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by |
| // frequencies. |
| int cmp_scis_hz_splid_freq(const void* p1, const void* p2) { |
| const SingleCharItem *s1, *s2; |
| s1 = static_cast<const SingleCharItem*>(p1); |
| s2 = static_cast<const SingleCharItem*>(p2); |
| |
| if (s1->hz < s2->hz) |
| return -1; |
| if (s1->hz > s2->hz) |
| return 1; |
| |
| if (s1->splid.half_splid < s2->splid.half_splid) |
| return -1; |
| if (s1->splid.half_splid > s2->splid.half_splid) |
| return 1; |
| |
| if (s1->splid.full_splid < s2->splid.full_splid) |
| return -1; |
| if (s1->splid.full_splid > s2->splid.full_splid) |
| return 1; |
| |
| if (s1->freq > s2->freq) |
| return -1; |
| if (s1->freq < s2->freq) |
| return 1; |
| return 0; |
| } |
| |
| int cmp_scis_hz_splid(const void* p1, const void* p2) { |
| const SingleCharItem *s1, *s2; |
| s1 = static_cast<const SingleCharItem*>(p1); |
| s2 = static_cast<const SingleCharItem*>(p2); |
| |
| if (s1->hz < s2->hz) |
| return -1; |
| if (s1->hz > s2->hz) |
| return 1; |
| |
| if (s1->splid.half_splid < s2->splid.half_splid) |
| return -1; |
| if (s1->splid.half_splid > s2->splid.half_splid) |
| return 1; |
| |
| if (s1->splid.full_splid < s2->splid.full_splid) |
| return -1; |
| if (s1->splid.full_splid > s2->splid.full_splid) |
| return 1; |
| |
| return 0; |
| } |
| |
| int cmp_lemma_entry_hzs(const void* p1, const void* p2) { |
| size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); |
| size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); |
| if (size1 < size2) |
| return -1; |
| else if (size1 > size2) |
| return 1; |
| |
| return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, |
| ((const LemmaEntry*)p2)->hanzi_str); |
| } |
| |
| int compare_char16(const void* p1, const void* p2) { |
| if (*((const char16*)p1) < *((const char16*)p2)) |
| return -1; |
| if (*((const char16*)p1) > *((const char16*)p2)) |
| return 1; |
| return 0; |
| } |
| |
| int compare_py(const void* p1, const void* p2) { |
| int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, |
| ((const LemmaEntry*)p2)->spl_idx_arr); |
| |
| if (0 != ret) |
| return ret; |
| |
| return static_cast<int>(((const LemmaEntry*)p2)->freq) - |
| static_cast<int>(((const LemmaEntry*)p1)->freq); |
| } |
| |
| // First hanzi, if the same, then Pinyin |
| int cmp_lemma_entry_hzspys(const void* p1, const void* p2) { |
| size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); |
| size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); |
| if (size1 < size2) |
| return -1; |
| else if (size1 > size2) |
| return 1; |
| int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, |
| ((const LemmaEntry*)p2)->hanzi_str); |
| |
| if (0 != ret) |
| return ret; |
| |
| ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, |
| ((const LemmaEntry*)p2)->spl_idx_arr); |
| return ret; |
| } |
| |
| int compare_splid2(const void* p1, const void* p2) { |
| int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, |
| ((const LemmaEntry*)p2)->spl_idx_arr); |
| return ret; |
| } |
| |
| DictBuilder::DictBuilder() { |
| lemma_arr_ = NULL; |
| lemma_num_ = 0; |
| |
| scis_ = NULL; |
| scis_num_ = 0; |
| |
| lma_nodes_le0_ = NULL; |
| lma_nodes_ge1_ = NULL; |
| |
| lma_nds_used_num_le0_ = 0; |
| lma_nds_used_num_ge1_ = 0; |
| |
| homo_idx_buf_ = NULL; |
| homo_idx_num_eq1_ = 0; |
| homo_idx_num_gt1_ = 0; |
| |
| top_lmas_ = NULL; |
| top_lmas_num_ = 0; |
| |
| spl_table_ = NULL; |
| spl_parser_ = NULL; |
| } |
| |
| DictBuilder::~DictBuilder() { |
| free_resource(); |
| } |
| |
| bool DictBuilder::alloc_resource(size_t lma_num) { |
| if (0 == lma_num) |
| return false; |
| |
| free_resource(); |
| |
| lemma_num_ = lma_num; |
| lemma_arr_ = new LemmaEntry[lemma_num_]; |
| |
| top_lmas_num_ = 0; |
| top_lmas_ = new LemmaEntry[kTopScoreLemmaNum]; |
| |
| // New the scis_ buffer to the possible maximum size. |
| scis_num_ = lemma_num_ * kMaxLemmaSize; |
| scis_ = new SingleCharItem[scis_num_]; |
| |
| // The root and first level nodes is less than kMaxSpellingNum + 1 |
| lma_nds_used_num_le0_ = 0; |
| lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1]; |
| |
| // Other nodes is less than lemma_num |
| lma_nds_used_num_ge1_ = 0; |
| lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_]; |
| |
| homo_idx_buf_ = new LemmaIdType[lemma_num_]; |
| spl_table_ = new SpellingTable(); |
| spl_parser_ = new SpellingParser(); |
| |
| if (NULL == lemma_arr_ || NULL == top_lmas_ || |
| NULL == scis_ || NULL == spl_table_ || |
| NULL == spl_parser_ || NULL == lma_nodes_le0_ || |
| NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) { |
| free_resource(); |
| return false; |
| } |
| |
| memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_); |
| memset(scis_, 0, sizeof(SingleCharItem) * scis_num_); |
| memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1)); |
| memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_); |
| memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_); |
| spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true); |
| |
| return true; |
| } |
| |
| char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) { |
| if (NULL == fn_validhzs || NULL == num) |
| return NULL; |
| |
| *num = 0; |
| FILE *fp = fopen(fn_validhzs, "rb"); |
| if (NULL == fp) |
| return NULL; |
| |
| char16 utf16header; |
| if (fread(&utf16header, sizeof(char16), 1, fp) != 1 || |
| 0xfeff != utf16header) { |
| fclose(fp); |
| return NULL; |
| } |
| |
| fseek(fp, 0, SEEK_END); |
| *num = ftell(fp) / sizeof(char16); |
| assert(*num >= 1); |
| *num -= 1; |
| |
| char16 *hzs = new char16[*num]; |
| if (NULL == hzs) { |
| fclose(fp); |
| return NULL; |
| } |
| |
| fseek(fp, 2, SEEK_SET); |
| |
| if (fread(hzs, sizeof(char16), *num, fp) != *num) { |
| fclose(fp); |
| delete [] hzs; |
| return NULL; |
| } |
| fclose(fp); |
| |
| myqsort(hzs, *num, sizeof(char16), compare_char16); |
| return hzs; |
| } |
| |
| bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, |
| char16 hz) { |
| if (NULL == hzs) |
| return false; |
| |
| char16 *found; |
| found = static_cast<char16*>( |
| mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16)); |
| if (NULL == found) |
| return false; |
| |
| assert(*found == hz); |
| return true; |
| } |
| |
| // The caller makes sure that the parameters are valid. |
| bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len, |
| const char16 *str, size_t str_len) { |
| if (NULL == hzs || NULL == str) |
| return false; |
| |
| for (size_t pos = 0; pos < str_len; pos++) { |
| if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) |
| return false; |
| } |
| return true; |
| } |
| |
| void DictBuilder::get_top_lemmas() { |
| top_lmas_num_ = 0; |
| if (NULL == lemma_arr_) |
| return; |
| |
| for (size_t pos = 0; pos < lemma_num_; pos++) { |
| if (0 == top_lmas_num_) { |
| top_lmas_[0] = lemma_arr_[pos]; |
| top_lmas_num_ = 1; |
| continue; |
| } |
| |
| if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) { |
| if (kTopScoreLemmaNum > top_lmas_num_) |
| top_lmas_num_ += 1; |
| |
| size_t move_pos; |
| for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) { |
| top_lmas_[move_pos] = top_lmas_[move_pos - 1]; |
| if (0 == move_pos - 1 || |
| (move_pos - 1 > 0 && |
| top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) { |
| break; |
| } |
| } |
| assert(move_pos > 0); |
| top_lmas_[move_pos - 1] = lemma_arr_[pos]; |
| } else if (kTopScoreLemmaNum > top_lmas_num_) { |
| top_lmas_[top_lmas_num_] = lemma_arr_[pos]; |
| top_lmas_num_ += 1; |
| } |
| } |
| |
| if (kPrintDebug0) { |
| printf("\n------Top Lemmas------------------\n"); |
| for (size_t pos = 0; pos < top_lmas_num_; pos++) { |
| printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz, |
| top_lmas_[pos].freq); |
| } |
| } |
| } |
| |
| void DictBuilder::free_resource() { |
| if (NULL != lemma_arr_) |
| delete [] lemma_arr_; |
| |
| if (NULL != scis_) |
| delete [] scis_; |
| |
| if (NULL != lma_nodes_le0_) |
| delete [] lma_nodes_le0_; |
| |
| if (NULL != lma_nodes_ge1_) |
| delete [] lma_nodes_ge1_; |
| |
| if (NULL != homo_idx_buf_) |
| delete [] homo_idx_buf_; |
| |
| if (NULL != spl_table_) |
| delete spl_table_; |
| |
| if (NULL != spl_parser_) |
| delete spl_parser_; |
| |
| lemma_arr_ = NULL; |
| scis_ = NULL; |
| lma_nodes_le0_ = NULL; |
| lma_nodes_ge1_ = NULL; |
| homo_idx_buf_ = NULL; |
| spl_table_ = NULL; |
| spl_parser_ = NULL; |
| |
| lemma_num_ = 0; |
| lma_nds_used_num_le0_ = 0; |
| lma_nds_used_num_ge1_ = 0; |
| homo_idx_num_eq1_ = 0; |
| homo_idx_num_gt1_ = 0; |
| } |
| |
| size_t DictBuilder::read_raw_dict(const char* fn_raw, |
| const char *fn_validhzs, |
| size_t max_item) { |
| if (NULL == fn_raw) return 0; |
| |
| Utf16Reader utf16_reader; |
| if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) |
| return false; |
| |
| char16 read_buf[kReadBufLen]; |
| |
| // Read the number of lemmas in the file |
| size_t lemma_num = 240000; |
| |
| // allocate resource required |
| if (!alloc_resource(lemma_num)) { |
| utf16_reader.close(); |
| } |
| |
| // Read the valid Hanzi list. |
| char16 *valid_hzs = NULL; |
| size_t valid_hzs_num = 0; |
| valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num); |
| |
| // Begin reading the lemma entries |
| for (size_t i = 0; i < max_item; i++) { |
| // read next entry |
| if (!utf16_reader.readline(read_buf, kReadBufLen)) { |
| lemma_num = i; |
| break; |
| } |
| |
| size_t token_size; |
| char16 *token; |
| char16 *to_tokenize = read_buf; |
| |
| // Get the Hanzi string |
| token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); |
| if (NULL == token) { |
| free_resource(); |
| utf16_reader.close(); |
| return false; |
| } |
| |
| size_t lemma_size = utf16_strlen(token); |
| |
| if (lemma_size > kMaxLemmaSize) { |
| i--; |
| continue; |
| } |
| |
| if (lemma_size > 4) { |
| i--; |
| continue; |
| } |
| |
| // Copy to the lemma entry |
| utf16_strcpy(lemma_arr_[i].hanzi_str, token); |
| |
| lemma_arr_[i].hz_str_len = token_size; |
| |
| // Get the freq string |
| token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); |
| if (NULL == token) { |
| free_resource(); |
| utf16_reader.close(); |
| return false; |
| } |
| lemma_arr_[i].freq = utf16_atof(token); |
| |
| if (lemma_size > 1 && lemma_arr_[i].freq < 60) { |
| i--; |
| continue; |
| } |
| |
| // Get GBK mark, if no valid Hanzi list available, all items which contains |
| // GBK characters will be discarded. Otherwise, all items which contains |
| // characters outside of the valid Hanzi list will be discarded. |
| token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); |
| assert(NULL != token); |
| int gbk_flag = utf16_atoi(token); |
| if (NULL == valid_hzs || 0 == valid_hzs_num) { |
| if (0 != gbk_flag) { |
| i--; |
| continue; |
| } |
| } else { |
| if (!str_in_hanzis_list(valid_hzs, valid_hzs_num, |
| lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) { |
| i--; |
| continue; |
| } |
| } |
| |
| // Get spelling String |
| bool spelling_not_support = false; |
| for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; |
| hz_pos++) { |
| // Get a Pinyin |
| token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); |
| if (NULL == token) { |
| free_resource(); |
| utf16_reader.close(); |
| return false; |
| } |
| |
| assert(utf16_strlen(token) <= kMaxPinyinSize); |
| |
| utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token); |
| |
| format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]); |
| |
| // Put the pinyin to the spelling table |
| if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos], |
| lemma_arr_[i].freq)) { |
| spelling_not_support = true; |
| break; |
| } |
| } |
| |
| // The whole line must have been parsed fully, otherwise discard this one. |
| token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); |
| if (spelling_not_support || NULL != token) { |
| i--; |
| continue; |
| } |
| } |
| |
| delete [] valid_hzs; |
| utf16_reader.close(); |
| |
| printf("read succesfully, lemma num: %d\n", lemma_num); |
| |
| return lemma_num; |
| } |
| |
| bool DictBuilder::build_dict(const char *fn_raw, |
| const char *fn_validhzs, |
| DictTrie *dict_trie) { |
| if (NULL == fn_raw || NULL == dict_trie) |
| return false; |
| |
| lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000); |
| if (0 == lemma_num_) |
| return false; |
| |
| // Arrange the spelling table, and build a spelling tree |
| // The size of an spelling. '\0' is included. If the spelling table is |
| // initialized to calculate the spelling scores, the last char in the |
| // spelling string will be score, and it is also included in spl_item_size. |
| size_t spl_item_size; |
| size_t spl_num; |
| const char* spl_buf; |
| spl_buf = spl_table_->arrange(&spl_item_size, &spl_num); |
| if (NULL == spl_buf) { |
| free_resource(); |
| return false; |
| } |
| |
| SpellingTrie &spl_trie = SpellingTrie::get_instance(); |
| |
| if (!spl_trie.construct(spl_buf, spl_item_size, spl_num, |
| spl_table_->get_score_amplifier(), |
| spl_table_->get_average_score())) { |
| free_resource(); |
| return false; |
| } |
| |
| printf("spelling tree construct successfully.\n"); |
| |
| // Convert the spelling string to idxs |
| for (size_t i = 0; i < lemma_num_; i++) { |
| for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; |
| hz_pos++) { |
| uint16 spl_idxs[2]; |
| uint16 spl_start_pos[3]; |
| bool is_pre = true; |
| int spl_idx_num = |
| spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos], |
| strlen(lemma_arr_[i].pinyin_str[hz_pos]), |
| spl_idxs, spl_start_pos, 2, is_pre); |
| assert(1 == spl_idx_num); |
| |
| if (spl_trie.is_half_id(spl_idxs[0])) { |
| uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs); |
| assert(0 != num); |
| } |
| lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0]; |
| } |
| } |
| |
| // Sort the lemma items according to the hanzi, and give each unique item a |
| // id |
| sort_lemmas_by_hz(); |
| |
| scis_num_ = build_scis(); |
| |
| // Construct the dict list |
| dict_trie->dict_list_ = new DictList(); |
| bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_, |
| lemma_arr_, lemma_num_); |
| assert(dl_success); |
| |
| // Construct the NGram information |
| NGram& ngram = NGram::get_instance(); |
| ngram.build_unigram(lemma_arr_, lemma_num_, |
| lemma_arr_[lemma_num_ - 1].idx_by_hz + 1); |
| |
| // sort the lemma items according to the spelling idx string |
| myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py); |
| |
| get_top_lemmas(); |
| |
| #ifdef ___DO_STATISTICS___ |
| stat_init(); |
| #endif |
| |
| lma_nds_used_num_le0_ = 1; // The root node |
| bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_), |
| lemma_arr_, 0, lemma_num_, 0); |
| if (!dt_success) { |
| free_resource(); |
| return false; |
| } |
| |
| #ifdef ___DO_STATISTICS___ |
| stat_print(); |
| #endif |
| |
| // Move the node data and homo data to the DictTrie |
| dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_]; |
| dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_]; |
| size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_; |
| dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize]; |
| assert(NULL != dict_trie->root_); |
| assert(NULL != dict_trie->lma_idx_buf_); |
| dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_; |
| dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_; |
| dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize; |
| dict_trie->top_lmas_num_ = top_lmas_num_; |
| |
| memcpy(dict_trie->root_, lma_nodes_le0_, |
| sizeof(LmaNodeLE0) * lma_nds_used_num_le0_); |
| memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_, |
| sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_); |
| |
| for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) { |
| id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, |
| homo_idx_buf_[pos]); |
| } |
| |
| for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_; |
| pos < lma_idx_num; pos++) { |
| LemmaIdType idx = |
| top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz; |
| id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx); |
| } |
| |
| if (kPrintDebug0) { |
| printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_); |
| printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_); |
| printf("top_lmas_num_: %d\n", top_lmas_num_); |
| } |
| |
| free_resource(); |
| |
| if (kPrintDebug0) { |
| printf("Building dict succeds\n"); |
| } |
| return dt_success; |
| } |
| |
| void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) { |
| if (NULL == buf) return; |
| for (size_t pos = 0; pos < kLemmaIdSize; pos++) { |
| (buf)[pos] = (unsigned char)(id >> (pos * 8)); |
| } |
| } |
| |
| void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) { |
| node->son_1st_off_l = static_cast<uint16>(offset); |
| node->son_1st_off_h = static_cast<unsigned char>(offset >> 16); |
| } |
| |
| void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) { |
| node->homo_idx_buf_off_l = static_cast<uint16>(offset); |
| node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16); |
| |
| } |
| |
| // All spelling strings will be converted to upper case, except that |
| // spellings started with "ZH"/"CH"/"SH" will be converted to |
| // "Zh"/"Ch"/"Sh" |
| void DictBuilder::format_spelling_str(char *spl_str) { |
| if (NULL == spl_str) |
| return; |
| |
| uint16 pos = 0; |
| while ('\0' != spl_str[pos]) { |
| if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') |
| spl_str[pos] = spl_str[pos] - 'a' + 'A'; |
| |
| if (1 == pos && 'H' == spl_str[pos]) { |
| if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) { |
| spl_str[pos] = 'h'; |
| } |
| } |
| pos++; |
| } |
| } |
| |
| LemmaIdType DictBuilder::sort_lemmas_by_hz() { |
| if (NULL == lemma_arr_ || 0 == lemma_num_) |
| return 0; |
| |
| myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs); |
| |
| lemma_arr_[0].idx_by_hz = 1; |
| LemmaIdType idx_max = 1; |
| for (size_t i = 1; i < lemma_num_; i++) { |
| if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) { |
| idx_max++; |
| lemma_arr_[i].idx_by_hz = idx_max; |
| } else { |
| idx_max++; |
| lemma_arr_[i].idx_by_hz = idx_max; |
| } |
| } |
| return idx_max + 1; |
| } |
| |
| size_t DictBuilder::build_scis() { |
| if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) |
| return 0; |
| |
| SpellingTrie &spl_trie = SpellingTrie::get_instance(); |
| |
| // This first one is blank, because id 0 is invalid. |
| scis_[0].freq = 0; |
| scis_[0].hz = 0; |
| scis_[0].splid.full_splid = 0; |
| scis_[0].splid.half_splid = 0; |
| scis_num_ = 1; |
| |
| // Copy the hanzis to the buffer |
| for (size_t pos = 0; pos < lemma_num_; pos++) { |
| size_t hz_num = lemma_arr_[pos].hz_str_len; |
| for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { |
| scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos]; |
| scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; |
| scis_[scis_num_].splid.half_splid = |
| spl_trie.full_to_half(scis_[scis_num_].splid.full_splid); |
| if (1 == hz_num) |
| scis_[scis_num_].freq = lemma_arr_[pos].freq; |
| else |
| scis_[scis_num_].freq = 0.000001; |
| scis_num_++; |
| } |
| } |
| |
| myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq); |
| |
| // Remove repeated items |
| size_t unique_scis_num = 1; |
| for (size_t pos = 1; pos < scis_num_; pos++) { |
| if (scis_[pos].hz == scis_[pos - 1].hz && |
| scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) |
| continue; |
| scis_[unique_scis_num] = scis_[pos]; |
| scis_[unique_scis_num].splid.half_splid = |
| spl_trie.full_to_half(scis_[pos].splid.full_splid); |
| unique_scis_num++; |
| } |
| |
| scis_num_ = unique_scis_num; |
| |
| // Update the lemma list. |
| for (size_t pos = 0; pos < lemma_num_; pos++) { |
| size_t hz_num = lemma_arr_[pos].hz_str_len; |
| for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { |
| SingleCharItem key; |
| key.hz = lemma_arr_[pos].hanzi_str[hzpos]; |
| key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; |
| key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid); |
| |
| SingleCharItem *found; |
| found = static_cast<SingleCharItem*>(mybsearch(&key, scis_, |
| unique_scis_num, |
| sizeof(SingleCharItem), |
| cmp_scis_hz_splid)); |
| |
| assert(found); |
| |
| lemma_arr_[pos].hanzi_scis_ids[hzpos] = |
| static_cast<uint16>(found - scis_); |
| lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid; |
| } |
| } |
| |
| return scis_num_; |
| } |
| |
| bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr, |
| size_t item_start, size_t item_end, |
| size_t level) { |
| if (level >= kMaxLemmaSize || item_end <= item_start) |
| return false; |
| |
| // 1. Scan for how many sons |
| size_t parent_son_num = 0; |
| // LemmaNode *son_1st = NULL; |
| // parent.num_of_son = 0; |
| |
| LemmaEntry *lma_last_start = lemma_arr_ + item_start; |
| uint16 spl_idx_node = lma_last_start->spl_idx_arr[level]; |
| |
| // Scan for how many sons to be allocaed |
| for (size_t i = item_start + 1; i< item_end; i++) { |
| LemmaEntry *lma_current = lemma_arr + i; |
| uint16 spl_idx_current = lma_current->spl_idx_arr[level]; |
| if (spl_idx_current != spl_idx_node) { |
| parent_son_num++; |
| spl_idx_node = spl_idx_current; |
| } |
| } |
| parent_son_num++; |
| |
| #ifdef ___DO_STATISTICS___ |
| // Use to indicate whether all nodes of this layer have no son. |
| bool allson_noson = true; |
| |
| assert(level < kMaxLemmaSize); |
| if (parent_son_num > max_sonbuf_len_[level]) |
| max_sonbuf_len_[level] = parent_son_num; |
| |
| total_son_num_[level] += parent_son_num; |
| total_sonbuf_num_[level] += 1; |
| |
| if (parent_son_num == 1) |
| sonbufs_num1_++; |
| else |
| sonbufs_numgt1_++; |
| total_lma_node_num_ += parent_son_num; |
| #endif |
| |
| // 2. Update the parent's information |
| // Update the parent's son list; |
| LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used |
| LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used. |
| if (0 == level) { // the parent is root |
| (static_cast<LmaNodeLE0*>(parent))->son_1st_off = |
| lma_nds_used_num_le0_; |
| son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_; |
| lma_nds_used_num_le0_ += parent_son_num; |
| |
| assert(parent_son_num <= 65535); |
| (static_cast<LmaNodeLE0*>(parent))->num_of_son = |
| static_cast<uint16>(parent_son_num); |
| } else if (1 == level) { // the parent is a son of root |
| (static_cast<LmaNodeLE0*>(parent))->son_1st_off = |
| lma_nds_used_num_ge1_; |
| son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; |
| lma_nds_used_num_ge1_ += parent_son_num; |
| |
| assert(parent_son_num <= 65535); |
| (static_cast<LmaNodeLE0*>(parent))->num_of_son = |
| static_cast<uint16>(parent_son_num); |
| } else { |
| set_son_offset((static_cast<LmaNodeGE1*>(parent)), |
| lma_nds_used_num_ge1_); |
| son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; |
| lma_nds_used_num_ge1_ += parent_son_num; |
| |
| assert(parent_son_num <= 255); |
| (static_cast<LmaNodeGE1*>(parent))->num_of_son = |
| (unsigned char)parent_son_num; |
| } |
| |
| // 3. Now begin to construct the son one by one |
| size_t son_pos = 0; |
| |
| lma_last_start = lemma_arr_ + item_start; |
| spl_idx_node = lma_last_start->spl_idx_arr[level]; |
| |
| size_t homo_num = 0; |
| if (lma_last_start->spl_idx_arr[level + 1] == 0) |
| homo_num = 1; |
| |
| size_t item_start_next = item_start; |
| |
| for (size_t i = item_start + 1; i < item_end; i++) { |
| LemmaEntry* lma_current = lemma_arr_ + i; |
| uint16 spl_idx_current = lma_current->spl_idx_arr[level]; |
| |
| if (spl_idx_current == spl_idx_node) { |
| if (lma_current->spl_idx_arr[level + 1] == 0) |
| homo_num++; |
| } else { |
| // Construct a node |
| LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid |
| LmaNodeGE1 *node_cur_ge1 = NULL; |
| if (0 == level) { |
| node_cur_le0 = son_1st_le0 + son_pos; |
| node_cur_le0->spl_idx = spl_idx_node; |
| node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; |
| node_cur_le0->son_1st_off = 0; |
| homo_idx_num_eq1_ += homo_num; |
| } else { |
| node_cur_ge1 = son_1st_ge1 + son_pos; |
| node_cur_ge1->spl_idx = spl_idx_node; |
| |
| set_homo_id_buf_offset(node_cur_ge1, |
| (homo_idx_num_eq1_ + homo_idx_num_gt1_)); |
| set_son_offset(node_cur_ge1, 0); |
| homo_idx_num_gt1_ += homo_num; |
| } |
| |
| if (homo_num > 0) { |
| LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + |
| homo_idx_num_gt1_ - homo_num; |
| if (0 == level) { |
| assert(homo_num <= 65535); |
| node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); |
| } else { |
| assert(homo_num <= 255); |
| node_cur_ge1->num_of_homo = (unsigned char)homo_num; |
| } |
| |
| for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { |
| idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz; |
| } |
| |
| #ifdef ___DO_STATISTICS___ |
| if (homo_num > max_homobuf_len_[level]) |
| max_homobuf_len_[level] = homo_num; |
| |
| total_homo_num_[level] += homo_num; |
| #endif |
| } |
| |
| if (i - item_start_next > homo_num) { |
| void *next_parent; |
| if (0 == level) |
| next_parent = static_cast<void*>(node_cur_le0); |
| else |
| next_parent = static_cast<void*>(node_cur_ge1); |
| construct_subset(next_parent, lemma_arr, |
| item_start_next + homo_num, i, level + 1); |
| #ifdef ___DO_STATISTICS___ |
| |
| total_node_hasson_[level] += 1; |
| allson_noson = false; |
| #endif |
| } |
| |
| // for the next son |
| lma_last_start = lma_current; |
| spl_idx_node = spl_idx_current; |
| item_start_next = i; |
| homo_num = 0; |
| if (lma_current->spl_idx_arr[level + 1] == 0) |
| homo_num = 1; |
| |
| son_pos++; |
| } |
| } |
| |
| // 4. The last one to construct |
| LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid |
| LmaNodeGE1 *node_cur_ge1 = NULL; |
| if (0 == level) { |
| node_cur_le0 = son_1st_le0 + son_pos; |
| node_cur_le0->spl_idx = spl_idx_node; |
| node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; |
| node_cur_le0->son_1st_off = 0; |
| homo_idx_num_eq1_ += homo_num; |
| } else { |
| node_cur_ge1 = son_1st_ge1 + son_pos; |
| node_cur_ge1->spl_idx = spl_idx_node; |
| |
| set_homo_id_buf_offset(node_cur_ge1, |
| (homo_idx_num_eq1_ + homo_idx_num_gt1_)); |
| set_son_offset(node_cur_ge1, 0); |
| homo_idx_num_gt1_ += homo_num; |
| } |
| |
| if (homo_num > 0) { |
| LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + |
| homo_idx_num_gt1_ - homo_num; |
| if (0 == level) { |
| assert(homo_num <= 65535); |
| node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); |
| } else { |
| assert(homo_num <= 255); |
| node_cur_ge1->num_of_homo = (unsigned char)homo_num; |
| } |
| |
| for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { |
| idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz; |
| } |
| |
| #ifdef ___DO_STATISTICS___ |
| if (homo_num > max_homobuf_len_[level]) |
| max_homobuf_len_[level] = homo_num; |
| |
| total_homo_num_[level] += homo_num; |
| #endif |
| } |
| |
| if (item_end - item_start_next > homo_num) { |
| void *next_parent; |
| if (0 == level) |
| next_parent = static_cast<void*>(node_cur_le0); |
| else |
| next_parent = static_cast<void*>(node_cur_ge1); |
| construct_subset(next_parent, lemma_arr, |
| item_start_next + homo_num, item_end, level + 1); |
| #ifdef ___DO_STATISTICS___ |
| |
| total_node_hasson_[level] += 1; |
| allson_noson = false; |
| #endif |
| } |
| |
| #ifdef ___DO_STATISTICS___ |
| if (allson_noson) { |
| total_sonbuf_allnoson_[level] += 1; |
| total_node_in_sonbuf_allnoson_[level] += parent_son_num; |
| } |
| #endif |
| |
| assert(son_pos + 1 == parent_son_num); |
| return true; |
| } |
| |
| #ifdef ___DO_STATISTICS___ |
| void DictBuilder::stat_init() { |
| memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); |
| memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize); |
| |
| sonbufs_num1_ = 0; |
| sonbufs_numgt1_ = 0; |
| total_lma_node_num_ = 0; |
| } |
| |
| void DictBuilder::stat_print() { |
| printf("\n------------STAT INFO-------------\n"); |
| printf("[root is layer -1]\n"); |
| printf(".. max_sonbuf_len per layer(from layer 0):\n "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", max_sonbuf_len_[i]); |
| printf("-, \n"); |
| |
| printf(".. max_homobuf_len per layer:\n -, "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", max_homobuf_len_[i]); |
| printf("\n"); |
| |
| printf(".. total_son_num per layer:\n "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", total_son_num_[i]); |
| printf("-, \n"); |
| |
| printf(".. total_node_hasson per layer:\n 1, "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", total_node_hasson_[i]); |
| printf("\n"); |
| |
| printf(".. total_sonbuf_num per layer:\n "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", total_sonbuf_num_[i]); |
| printf("-, \n"); |
| |
| printf(".. total_sonbuf_allnoson per layer:\n "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", total_sonbuf_allnoson_[i]); |
| printf("-, \n"); |
| |
| printf(".. total_node_in_sonbuf_allnoson per layer:\n "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", total_node_in_sonbuf_allnoson_[i]); |
| printf("-, \n"); |
| |
| printf(".. total_homo_num per layer:\n 0, "); |
| for (size_t i = 0; i < kMaxLemmaSize; i++) |
| printf("%d, ", total_homo_num_[i]); |
| printf("\n"); |
| |
| printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_); |
| printf(".. son buf allocation number with more than 1 son: %d\n", |
| sonbufs_numgt1_); |
| printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1); |
| } |
| #endif // ___DO_STATISTICS___ |
| |
| #endif // ___BUILD_MODEL___ |
| } // namespace ime_pinyin |