/*
* Copyright (C) 2009 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/userdict.h"
#include "../include/splparser.h"
#include "../include/ngram.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <cutils/log.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <assert.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/time.h>
#include <time.h>
#include <pthread.h>
#include <math.h>
namespace ime_pinyin {
#ifdef ___DEBUG_PERF___
static uint64 _ellapse_ = 0;
static struct timeval _tv_start_, _tv_end_;
#define DEBUG_PERF_BEGIN \
do { \
gettimeofday(&_tv_start_, NULL); \
} while(0)
#define DEBUG_PERF_END \
do { \
gettimeofday(&_tv_end_, NULL); \
_ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
(_tv_end_.tv_usec - _tv_start_.tv_usec); \
} while(0)
#define LOGD_PERF(message) \
LOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
#else
#define DEBUG_PERF_BEGIN
#define DEBUG_PERF_END
#define LOGD_PERF(message)
#endif
// XXX File load and write are made thread-safe by g_mutex_
static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
static struct timeval g_last_update_ = {0, 0};
inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
return (4 + info->lemma_size + (info->lemma_count << 3)
#ifdef ___PREDICT_ENABLED___
+ (info->lemma_count << 2)
#endif
#ifdef ___SYNC_ENABLED___
+ (info->sync_count << 2)
#endif
+ sizeof(*info));
}
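// A sketch of the on-disk layout implied by the size computation above
// (field order assumed from how load() and the write_back_*() helpers
// read and write the file):
//
//   [0..3]  version                (4 bytes)
//   [4..]   lemma pool             (lemma_size bytes)
//           offsets                (lemma_count * 4 bytes)
//           predicts (optional)    (lemma_count * 4 bytes)
//           scores                 (lemma_count * 4 bytes)
//           syncs (optional)       (sync_count * 4 bytes)
//           UserDictInfo           (sizeof(UserDictInfo) bytes)
//
// The (lemma_count << 3) term above accounts for offsets and scores
// together.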
inline LmaScoreType UserDict::translate_score(int raw_score) {
// 1) ori_freq: original user frequency
uint32 ori_freq = extract_score_freq(raw_score);
// 2) lmt_off: lmt index (week offset for example)
uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
if (kUserDictLMTBitWidth < 16) {
// Keep only the low kUserDictLMTBitWidth bits
uint64 mask = (1ULL << kUserDictLMTBitWidth) - 1;
lmt_off &= mask;
}
// 3) now_off: current time index (current week offset for example)
// assuming load_time_ is around current time
uint64 now_off = load_time_.tv_sec;
now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
now_off = (now_off << (64 - kUserDictLMTBitWidth));
now_off = (now_off >> (64 - kUserDictLMTBitWidth));
// 4) factor: decide expand-factor
int delta = now_off - lmt_off;
if (delta > 4)
delta = 4;
int factor = 80 - (delta << 4);
double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
* NGram::kLogValueAmplifier);
}
inline int UserDict::extract_score_freq(int raw_score) {
// Frequency is stored in the lowest 16 bits
int freq = (raw_score & 0x0000ffff);
return freq;
}
inline uint64 UserDict::extract_score_lmt(int raw_score) {
uint64 lmt = ((raw_score & 0xffff0000) >> 16);
if (kUserDictLMTBitWidth < 16) {
// Keep only the low kUserDictLMTBitWidth bits
uint64 mask = (1ULL << kUserDictLMTBitWidth) - 1;
lmt &= mask;
}
lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
return lmt;
}
inline int UserDict::build_score(uint64 lmt, int freq) {
lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
lmt = (lmt << (64 - kUserDictLMTBitWidth));
lmt = (lmt >> (64 - kUserDictLMTBitWidth));
uint16 lmt16 = (uint16)lmt;
int s = freq;
s &= 0x0000ffff;
s = (lmt16 << 16) | s;
return s;
}
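// Example of the packed score layout shared by build_score(),
// extract_score_freq() and extract_score_lmt():
//
//   bits 31..16: lmt, the last-modified time quantized to
//                kUserDictLMTGranularity units since kUserDictLMTSince
//   bits 15..0 : raw frequency
//
// e.g. (a sketch, assuming week granularity) frequency 5 in quantized
// week 0x0123 is stored as 0x01230005.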
inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
int64 ret = 0;
if (len <= 0)
return ret;
int flag = 1;
const uint16 * endp = s + len;
if (*s == '-') {
flag = -1;
s++;
} else if (*s == '+') {
s++;
}
while (s < endp && *s >= '0' && *s <= '9') {
ret = ret * 10 + (*s) - '0';
s++;
}
return ret * flag;
}
inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
if (!s || size <= 0)
return 0;
uint16 *endp = s + size;
int ret_len = 0;
if (v < 0) {
*(s++) = '-';
++ret_len;
v *= -1;
}
if (v == 0) {
// Emit a single '0' so that value 0 round-trips
*s = '0';
return ret_len + 1;
}
uint16 *b = s;
while (s < endp && v != 0) {
*(s++) = '0' + (v % 10);
v = v / 10;
++ret_len;
}
if (v != 0)
return 0;
// Digits were produced least-significant first; reverse them in place
--s;
while (b < s) {
uint16 t = *b;
*b = *s;
*s = t;
++b, --s;
}
return ret_len;
}
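// A minimal usage sketch of the two helpers above (buffer size is
// illustrative):
//
//   uint16 buf[24];
//   int n = utf16le_lltoa(-42, buf, 24);   // buf holds "-42", n == 3
//   int64 v = utf16le_atoll(buf, n);       // v == -42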
inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
offset &= kUserDictOffsetMask;
lemmas_[offset] |= flag;
}
inline char UserDict::get_lemma_flag(uint32 offset) {
offset &= kUserDictOffsetMask;
return (char)(lemmas_[offset]);
}
inline char UserDict::get_lemma_nchar(uint32 offset) {
offset &= kUserDictOffsetMask;
return (char)(lemmas_[offset + 1]);
}
inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
offset &= kUserDictOffsetMask;
return (uint16 *)(lemmas_ + offset + 2);
}
inline uint16 * UserDict::get_lemma_word(uint32 offset) {
offset &= kUserDictOffsetMask;
uint8 nchar = get_lemma_nchar(offset);
return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
}
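// In-memory lemma record layout assumed by the accessors above:
//
//   lemmas_[offset + 0]                   flag byte
//   lemmas_[offset + 1]                   nchar (number of hanzi)
//   lemmas_[offset + 2 ..]                spelling ids, nchar * uint16
//   lemmas_[offset + 2 + nchar * 2 ..]    word, nchar * char16
//
// i.e. each record occupies 2 + nchar * 4 bytes, matching the
// "(2 + (lemma_len << 2))" size bookkeeping used elsewhere.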
inline LemmaIdType UserDict::get_max_lemma_id() {
// When a lemma is deleted, we don't reclaim its id, for
// simplicity and performance
return start_id_ + dict_info_.lemma_count - 1;
}
inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
if (id >= start_id_ && id <= get_max_lemma_id())
return true;
return false;
}
inline bool UserDict::is_valid_state() {
if (state_ == USER_DICT_NONE)
return false;
return true;
}
UserDict::UserDict()
: start_id_(0),
version_(0),
lemmas_(NULL),
offsets_(NULL),
scores_(NULL),
ids_(NULL),
#ifdef ___PREDICT_ENABLED___
predicts_(NULL),
#endif
#ifdef ___SYNC_ENABLED___
syncs_(NULL),
sync_count_size_(0),
#endif
offsets_by_id_(NULL),
lemma_count_left_(0),
lemma_size_left_(0),
dict_file_(NULL),
state_(USER_DICT_NONE) {
memset(&dict_info_, 0, sizeof(dict_info_));
memset(&load_time_, 0, sizeof(load_time_));
#ifdef ___CACHE_ENABLED___
cache_init();
#endif
}
UserDict::~UserDict() {
close_dict();
}
bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
LemmaIdType end_id) {
#ifdef ___DEBUG_PERF___
DEBUG_PERF_BEGIN;
#endif
dict_file_ = strdup(file_name);
if (!dict_file_)
return false;
start_id_ = start_id;
if (false == validate(file_name) && false == reset(file_name)) {
goto error;
}
if (false == load(file_name, start_id)) {
goto error;
}
state_ = USER_DICT_SYNC;
gettimeofday(&load_time_, NULL);
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF("load_dict");
#endif
return true;
error:
free((void*)dict_file_);
dict_file_ = NULL;
start_id_ = 0;
return false;
}
bool UserDict::close_dict() {
if (state_ == USER_DICT_NONE)
return true;
if (state_ == USER_DICT_SYNC)
goto out;
// If the dictionary has been written back by others,
// we can not simply write it back here.
// To do a safe flush, we have to discard all newly added
// lemmas and try to reload the dict file.
pthread_mutex_lock(&g_mutex_);
if (load_time_.tv_sec > g_last_update_.tv_sec ||
(load_time_.tv_sec == g_last_update_.tv_sec &&
load_time_.tv_usec > g_last_update_.tv_usec)) {
write_back();
gettimeofday(&g_last_update_, NULL);
}
pthread_mutex_unlock(&g_mutex_);
out:
free((void*)dict_file_);
free(lemmas_);
free(offsets_);
free(offsets_by_id_);
free(scores_);
free(ids_);
#ifdef ___PREDICT_ENABLED___
free(predicts_);
#endif
version_ = 0;
dict_file_ = NULL;
lemmas_ = NULL;
#ifdef ___SYNC_ENABLED___
syncs_ = NULL;
sync_count_size_ = 0;
#endif
offsets_ = NULL;
offsets_by_id_ = NULL;
scores_ = NULL;
ids_ = NULL;
#ifdef ___PREDICT_ENABLED___
predicts_ = NULL;
#endif
memset(&dict_info_, 0, sizeof(dict_info_));
lemma_count_left_ = 0;
lemma_size_left_ = 0;
state_ = USER_DICT_NONE;
return true;
}
size_t UserDict::number_of_lemmas() {
return dict_info_.lemma_count;
}
void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
return;
}
MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
const DictExtPara *dep,
LmaPsbItem *lpi_items,
size_t lpi_max, size_t *lpi_num) {
if (is_valid_state() == false)
return 0;
bool need_extend = false;
#ifdef ___DEBUG_PERF___
DEBUG_PERF_BEGIN;
#endif
*lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
lpi_items, lpi_max, &need_extend);
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF("extend_dict");
#endif
return ((*lpi_num > 0 || need_extend) ? 1 : 0);
}
int UserDict::is_fuzzy_prefix_spell_id(
const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
if (len1 < searchable->splids_len)
return 0;
SpellingTrie &spl_trie = SpellingTrie::get_instance();
uint32 i = 0;
for (i = 0; i < searchable->splids_len; i++) {
const char py1 = *spl_trie.get_spelling_str(id1[i]);
uint16 off = 8 * (i % 4);
const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
if (py1 == py2)
continue;
return 0;
}
return 1;
}
int UserDict::fuzzy_compare_spell_id(
const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
if (len1 < searchable->splids_len)
return -1;
if (len1 > searchable->splids_len)
return 1;
SpellingTrie &spl_trie = SpellingTrie::get_instance();
uint32 i = 0;
for (i = 0; i < len1; i++) {
const char py1 = *spl_trie.get_spelling_str(id1[i]);
uint16 off = 8 * (i % 4);
const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
if (py1 == py2)
continue;
if (py1 > py2)
return 1;
return -1;
}
return 0;
}
bool UserDict::is_prefix_spell_id(
const uint16 * fullids, uint16 fulllen,
const UserDictSearchable *searchable) {
if (fulllen < searchable->splids_len)
return false;
uint32 i = 0;
for (; i < searchable->splids_len; i++) {
uint16 start_id = searchable->splid_start[i];
uint16 count = searchable->splid_count[i];
if (fullids[i] >= start_id && fullids[i] < start_id + count)
continue;
else
return false;
}
return true;
}
bool UserDict::equal_spell_id(
const uint16 * fullids, uint16 fulllen,
const UserDictSearchable *searchable) {
if (fulllen != searchable->splids_len)
return false;
uint32 i = 0;
for (; i < fulllen; i++) {
uint16 start_id = searchable->splid_start[i];
uint16 count = searchable->splid_count[i];
if (fullids[i] >= start_id && fullids[i] < start_id + count)
continue;
else
return false;
}
return true;
}
int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
int32 begin = 0;
int32 end = dict_info_.lemma_count - 1;
int32 middle = -1;
int32 first_prefix = middle;
int32 last_matched = middle;
while (begin <= end) {
middle = (begin + end) >> 1;
uint32 offset = offsets_[middle];
uint8 nchar = get_lemma_nchar(offset);
const uint16 * splids = get_lemma_spell_ids(offset);
int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
if (pre)
first_prefix = middle;
if (cmp < 0) {
begin = middle + 1;
} else if (cmp > 0) {
end = middle - 1;
} else {
end = middle - 1;
last_matched = middle;
}
}
return first_prefix;
}
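// Note: this binary search does not return an exact match; it returns
// the fuzzy-prefix position recorded while bisecting (or -1), and
// callers such as _get_lpis() scan forward from it, filtering with
// fuzzy_compare_spell_id() / equal_spell_id().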
void UserDict::prepare_locate(UserDictSearchable *searchable,
const uint16 *splid_str,
uint16 splid_str_len) {
searchable->splids_len = splid_str_len;
memset(searchable->signature, 0, sizeof(searchable->signature));
SpellingTrie &spl_trie = SpellingTrie::get_instance();
uint32 i = 0;
for (; i < splid_str_len; i++) {
if (spl_trie.is_half_id(splid_str[i])) {
searchable->splid_count[i] =
spl_trie.half_to_full(splid_str[i],
&(searchable->splid_start[i]));
} else {
searchable->splid_count[i] = 1;
searchable->splid_start[i] = splid_str[i];
}
const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
searchable->signature[i>>2] |= (py << (8 * (i % 4)));
}
}
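// Example of the signature packing above: the first letter of each
// spelling is packed four-per-uint32, lowest byte first. For the
// spellings "ni hao" (an illustrative input), signature[0] ends up
// holding 'n' | ('h' << 8), which is what is_fuzzy_prefix_spell_id()
// and fuzzy_compare_spell_id() unpack with (0xff << off).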
size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
LmaPsbItem *lpi_items, size_t lpi_max) {
return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
}
size_t UserDict::_get_lpis(const uint16 *splid_str,
uint16 splid_str_len, LmaPsbItem *lpi_items,
size_t lpi_max, bool * need_extend) {
bool tmp_extend;
if (!need_extend)
need_extend = &tmp_extend;
*need_extend = false;
if (is_valid_state() == false)
return 0;
if (lpi_max <= 0)
return 0;
if (0 == pthread_mutex_trylock(&g_mutex_)) {
if (load_time_.tv_sec < g_last_update_.tv_sec ||
(load_time_.tv_sec == g_last_update_.tv_sec &&
load_time_.tv_usec < g_last_update_.tv_usec)) {
// Others updated disk file, have to reload
pthread_mutex_unlock(&g_mutex_);
flush_cache();
} else {
pthread_mutex_unlock(&g_mutex_);
}
}
UserDictSearchable searchable;
prepare_locate(&searchable, splid_str, splid_str_len);
uint32 max_off = dict_info_.lemma_count;
#ifdef ___CACHE_ENABLED___
int32 middle;
uint32 start, count;
bool cached = cache_hit(&searchable, &start, &count);
if (cached) {
middle = start;
max_off = start + count;
} else {
middle = locate_first_in_offsets(&searchable);
start = middle;
}
#else
int32 middle = locate_first_in_offsets(&searchable);
#endif
if (middle == -1) {
#ifdef ___CACHE_ENABLED___
if (!cached)
cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
#endif
return 0;
}
size_t lpi_current = 0;
bool fuzzy_break = false;
bool prefix_break = false;
while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
if (lpi_current >= lpi_max)
break;
uint32 offset = offsets_[middle];
// Ignore deleted lemmas
if (offset & kUserDictOffsetFlagRemove) {
middle++;
continue;
}
uint8 nchar = get_lemma_nchar(offset);
uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#else
if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#endif
fuzzy_break = true;
}
if (prefix_break == false) {
if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
if (*need_extend == false &&
is_prefix_spell_id(splids, nchar, &searchable)) {
*need_extend = true;
}
} else {
prefix_break = true;
}
}
if (equal_spell_id(splids, nchar, &searchable) == true) {
lpi_items[lpi_current].psb = translate_score(scores_[middle]);
lpi_items[lpi_current].id = ids_[middle];
lpi_items[lpi_current].lma_len = nchar;
lpi_current++;
}
middle++;
}
#ifdef ___CACHE_ENABLED___
if (!cached) {
count = middle - start;
cache_push(USER_DICT_CACHE, &searchable, start, count);
}
#endif
return lpi_current;
}
uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
uint16 str_max) {
if (is_valid_state() == false)
return 0;
if (is_valid_lemma_id(id_lemma) == false)
return 0;
uint32 offset = offsets_by_id_[id_lemma - start_id_];
uint8 nchar = get_lemma_nchar(offset);
char16 * str = get_lemma_word(offset);
uint16 m = nchar < str_max - 1 ? nchar : str_max - 1;
int i = 0;
for (; i < m; i++) {
str_buf[i] = str[i];
}
str_buf[i] = 0;
return m;
}
uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
uint16 splids_max, bool arg_valid) {
if (is_valid_lemma_id(id_lemma) == false)
return 0;
uint32 offset = offsets_by_id_[id_lemma - start_id_];
uint8 nchar = get_lemma_nchar(offset);
const uint16 * ids = get_lemma_spell_ids(offset);
int i = 0;
for (; i < nchar && i < splids_max; i++)
splids[i] = ids[i];
return i;
}
size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
NPredictItem *npre_items, size_t npre_max,
size_t b4_used) {
uint32 new_added = 0;
#ifdef ___PREDICT_ENABLED___
int32 end = dict_info_.lemma_count - 1;
int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
if (j == -1)
return 0;
while (j <= end) {
uint32 offset = predicts_[j];
// Ignore deleted lemmas
if (offset & kUserDictOffsetFlagRemove) {
j++;
continue;
}
uint32 nchar = get_lemma_nchar(offset);
uint16 * words = get_lemma_word(offset);
uint16 * splids = get_lemma_spell_ids(offset);
if (nchar <= hzs_len) {
j++;
continue;
}
if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
if (new_added >= npre_max) {
return new_added;
}
uint32 cpy_len =
(nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
- (hzs_len << 1);
npre_items[new_added].his_len = hzs_len;
npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
if ((cpy_len >> 1) < kMaxPredictSize) {
npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
}
new_added++;
} else {
break;
}
j++;
}
#endif
return new_added;
}
int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
uint16 lemma_len) {
int32 max_off = dict_info_.lemma_count;
UserDictSearchable searchable;
prepare_locate(&searchable, splid_str, lemma_len);
#ifdef ___CACHE_ENABLED___
int32 off;
uint32 start, count;
bool cached = load_cache(&searchable, &start, &count);
if (cached) {
off = start;
max_off = start + count;
} else {
off = locate_first_in_offsets(&searchable);
start = off;
}
#else
int32 off = locate_first_in_offsets(&searchable);
#endif
if (off == -1) {
return off;
}
while (off < max_off) {
uint32 offset = offsets_[off];
if (offset & kUserDictOffsetFlagRemove) {
off++;
continue;
}
uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
break;
#else
if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
break;
#endif
if (equal_spell_id(splids, lemma_len, &searchable) == true) {
uint16 * str = get_lemma_word(offset);
uint32 i = 0;
for (i = 0; i < lemma_len; i++) {
if (str[i] == lemma_str[i])
continue;
break;
}
if (i < lemma_len) {
off++;
continue;
}
#ifdef ___CACHE_ENABLED___
// No need to save_cache here, since this function is invoked by
// put_lemma. It's rare for a user to input the same lemma twice:
// the first time a new lemma is typed it is added to the user
// dictionary, and only afterwards might the user type it again.
// Another reason save_cache can not be invoked here is that this
// function returns as soon as the lemma is found, so it never
// knows the count.
#endif
return off;
}
off++;
}
return -1;
}
#ifdef ___PREDICT_ENABLED___
uint32 UserDict::locate_where_to_insert_in_predicts(
const uint16 * words, int lemma_len) {
int32 begin = 0;
int32 end = dict_info_.lemma_count - 1;
int32 middle = end;
uint32 last_matched = middle;
while (begin <= end) {
middle = (begin + end) >> 1;
uint32 offset = offsets_[middle];
uint8 nchar = get_lemma_nchar(offset);
const uint16 * ws = get_lemma_word(offset);
uint32 minl = nchar < lemma_len ? nchar : lemma_len;
uint32 k = 0;
int cmp = 0;
for (; k < minl; k++) {
if (ws[k] < words[k]) {
cmp = -1;
break;
} else if (ws[k] > words[k]) {
cmp = 1;
break;
}
}
if (cmp == 0) {
if (nchar < lemma_len)
cmp = -1;
else if (nchar > lemma_len)
cmp = 1;
}
if (cmp < 0) {
begin = middle + 1;
last_matched = middle;
} else if (cmp > 0) {
end = middle - 1;
} else {
end = middle - 1;
last_matched = middle;
}
}
return last_matched;
}
int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
int32 begin = 0;
int32 end = dict_info_.lemma_count - 1;
int32 middle = -1;
int32 last_matched = middle;
while (begin <= end) {
middle = (begin + end) >> 1;
uint32 offset = offsets_[middle];
uint8 nchar = get_lemma_nchar(offset);
const uint16 * ws = get_lemma_word(offset);
uint32 minl = nchar < lemma_len ? nchar : lemma_len;
uint32 k = 0;
int cmp = 0;
for (; k < minl; k++) {
if (ws[k] < words[k]) {
cmp = -1;
break;
} else if (ws[k] > words[k]) {
cmp = 1;
break;
}
}
if (cmp == 0) {
if (nchar >= lemma_len)
last_matched = middle;
if (nchar < lemma_len)
cmp = -1;
else if (nchar > lemma_len)
cmp = 1;
}
if (cmp < 0) {
begin = middle + 1;
} else if (cmp > 0) {
end = middle - 1;
} else {
end = middle - 1;
}
}
return last_matched;
}
#endif
LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
uint16 lemma_len) {
int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
if (off == -1) {
return 0;
}
return ids_[off];
}
LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
if (is_valid_state() == false)
return 0;
if (is_valid_lemma_id(lemma_id) == false)
return 0;
return translate_score(_get_lemma_score(lemma_id));
}
LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
uint16 lemma_len) {
if (is_valid_state() == false)
return 0;
return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
}
int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
if (is_valid_state() == false)
return 0;
if (is_valid_lemma_id(lemma_id) == false)
return 0;
uint32 offset = offsets_by_id_[lemma_id - start_id_];
uint32 nchar = get_lemma_nchar(offset);
uint16 * spl = get_lemma_spell_ids(offset);
uint16 * wrd = get_lemma_word(offset);
int32 off = locate_in_offsets(wrd, spl, nchar);
if (off == -1) {
return 0;
}
return scores_[off];
}
int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
uint16 lemma_len) {
if (is_valid_state() == false)
return 0;
int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
if (off == -1) {
return 0;
}
return scores_[off];
}
#ifdef ___SYNC_ENABLED___
void UserDict::remove_lemma_from_sync_list(uint32 offset) {
offset &= kUserDictOffsetMask;
uint32 i = 0;
for (; i < dict_info_.sync_count; i++) {
unsigned int off = (syncs_[i] & kUserDictOffsetMask);
if (off == offset)
break;
}
if (i < dict_info_.sync_count) {
syncs_[i] = syncs_[dict_info_.sync_count - 1];
dict_info_.sync_count--;
}
}
#endif
#ifdef ___PREDICT_ENABLED___
void UserDict::remove_lemma_from_predict_list(uint32 offset) {
offset &= kUserDictOffsetMask;
uint32 i = 0;
for (; i < dict_info_.lemma_count; i++) {
unsigned int off = (predicts_[i] & kUserDictOffsetMask);
if (off == offset) {
predicts_[i] |= kUserDictOffsetFlagRemove;
break;
}
}
}
#endif
bool UserDict::remove_lemma_by_offset_index(int offset_index) {
if (is_valid_state() == false)
return false;
int32 off = offset_index;
if (off == -1) {
return false;
}
uint32 offset = offsets_[off];
uint32 nchar = get_lemma_nchar(offset);
offsets_[off] |= kUserDictOffsetFlagRemove;
#ifdef ___SYNC_ENABLED___
// Remove corresponding sync item
remove_lemma_from_sync_list(offset);
#endif
#ifdef ___PREDICT_ENABLED___
remove_lemma_from_predict_list(offset);
#endif
dict_info_.free_count++;
dict_info_.free_size += (2 + (nchar << 2));
if (state_ < USER_DICT_OFFSET_DIRTY)
state_ = USER_DICT_OFFSET_DIRTY;
return true;
}
bool UserDict::remove_lemma(LemmaIdType lemma_id) {
if (is_valid_state() == false)
return false;
if (is_valid_lemma_id(lemma_id) == false)
return false;
uint32 offset = offsets_by_id_[lemma_id - start_id_];
uint32 nchar = get_lemma_nchar(offset);
uint16 * spl = get_lemma_spell_ids(offset);
uint16 * wrd = get_lemma_word(offset);
int32 off = locate_in_offsets(wrd, spl, nchar);
return remove_lemma_by_offset_index(off);
}
void UserDict::flush_cache() {
LemmaIdType start_id = start_id_;
const char * file = strdup(dict_file_);
if (!file)
return;
close_dict();
load_dict(file, start_id, kUserDictIdEnd);
free((void*)file);
#ifdef ___CACHE_ENABLED___
cache_init();
#endif
return;
}
bool UserDict::reset(const char *file) {
FILE *fp = fopen(file, "w+");
if (!fp) {
return false;
}
uint32 version = kUserDictVersion;
size_t wred = fwrite(&version, 1, 4, fp);
UserDictInfo info;
memset(&info, 0, sizeof(info));
// By default there is no limit on lemma count or size,
// so reclaim_ratio is never used
wred += fwrite(&info, 1, sizeof(info), fp);
if (wred != sizeof(info) + sizeof(version)) {
fclose(fp);
unlink(file);
return false;
}
fclose(fp);
return true;
}
bool UserDict::validate(const char *file) {
// The 'b' flag is ignored on POSIX-compliant systems (including
// Linux), but on Windows it is required to select binary mode
FILE *fp = fopen(file, "rb");
if (!fp) {
return false;
}
size_t size;
size_t readed;
uint32 version;
UserDictInfo dict_info;
// validate
int err = fseek(fp, 0, SEEK_END);
if (err) {
goto error;
}
size = ftell(fp);
if (size < 4 + sizeof(dict_info)) {
goto error;
}
err = fseek(fp, 0, SEEK_SET);
if (err) {
goto error;
}
readed = fread(&version, 1, sizeof(version), fp);
if (readed < sizeof(version)) {
goto error;
}
if (version != kUserDictVersion) {
goto error;
}
err = fseek(fp, -(long)sizeof(dict_info), SEEK_END);
if (err) {
goto error;
}
readed = fread(&dict_info, 1, sizeof(dict_info), fp);
if (readed != sizeof(dict_info)) {
goto error;
}
if (size != get_dict_file_size(&dict_info)) {
goto error;
}
fclose(fp);
return true;
error:
fclose(fp);
return false;
}
bool UserDict::load(const char *file, LemmaIdType start_id) {
if (0 != pthread_mutex_trylock(&g_mutex_)) {
return false;
}
// The 'b' flag is ignored on POSIX-compliant systems (including
// Linux), but on Windows it is required to select binary mode
FILE *fp = fopen(file, "rb");
if (!fp) {
pthread_mutex_unlock(&g_mutex_);
return false;
}
size_t readed, toread;
UserDictInfo dict_info;
uint8 *lemmas = NULL;
uint32 *offsets = NULL;
#ifdef ___SYNC_ENABLED___
uint32 *syncs = NULL;
#endif
uint32 *scores = NULL;
uint32 *ids = NULL;
uint32 *offsets_by_id = NULL;
#ifdef ___PREDICT_ENABLED___
uint32 *predicts = NULL;
#endif
size_t i;
int err;
err = fseek(fp, -(long)sizeof(dict_info), SEEK_END);
if (err) goto error;
readed = fread(&dict_info, 1, sizeof(dict_info), fp);
if (readed != sizeof(dict_info)) goto error;
lemmas = (uint8 *)malloc(
dict_info.lemma_size +
(kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
if (!lemmas) goto error;
offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
if (!offsets) goto error;
#ifdef ___PREDICT_ENABLED___
predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
if (!predicts) goto error;
#endif
#ifdef ___SYNC_ENABLED___
syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
if (!syncs) goto error;
#endif
scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
if (!scores) goto error;
ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
if (!ids) goto error;
offsets_by_id = (uint32 *)malloc(
(dict_info.lemma_count + kUserDictPreAlloc) << 2);
if (!offsets_by_id) goto error;
err = fseek(fp, 4, SEEK_SET);
if (err) goto error;
readed = 0;
while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
}
if (readed < dict_info.lemma_size)
goto error;
toread = (dict_info.lemma_count << 2);
readed = 0;
while (readed < toread && !ferror(fp) && !feof(fp)) {
readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
}
if (readed < toread)
goto error;
#ifdef ___PREDICT_ENABLED___
toread = (dict_info.lemma_count << 2);
readed = 0;
while (readed < toread && !ferror(fp) && !feof(fp)) {
readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
}
if (readed < toread)
goto error;
#endif
readed = 0;
while (readed < toread && !ferror(fp) && !feof(fp)) {
readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
}
if (readed < toread)
goto error;
#ifdef ___SYNC_ENABLED___
toread = (dict_info.sync_count << 2);
readed = 0;
while (readed < toread && !ferror(fp) && !feof(fp)) {
readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
}
if (readed < toread)
goto error;
#endif
for (i = 0; i < dict_info.lemma_count; i++) {
ids[i] = start_id + i;
offsets_by_id[i] = offsets[i];
}
lemmas_ = lemmas;
offsets_ = offsets;
#ifdef ___SYNC_ENABLED___
syncs_ = syncs;
sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
#endif
offsets_by_id_ = offsets_by_id;
scores_ = scores;
ids_ = ids;
#ifdef ___PREDICT_ENABLED___
predicts_ = predicts;
#endif
lemma_count_left_ = kUserDictPreAlloc;
lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
memcpy(&dict_info_, &dict_info, sizeof(dict_info));
state_ = USER_DICT_SYNC;
fclose(fp);
pthread_mutex_unlock(&g_mutex_);
return true;
error:
if (lemmas) free(lemmas);
if (offsets) free(offsets);
#ifdef ___SYNC_ENABLED___
if (syncs) free(syncs);
#endif
if (scores) free(scores);
if (ids) free(ids);
if (offsets_by_id) free(offsets_by_id);
#ifdef ___PREDICT_ENABLED___
if (predicts) free(predicts);
#endif
fclose(fp);
pthread_mutex_unlock(&g_mutex_);
return false;
}
void UserDict::write_back() {
// XXX For thread-safety's sake, write back is only allowed from close_dict
if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
return;
int fd = open(dict_file_, O_WRONLY);
if (fd == -1)
return;
switch (state_) {
case USER_DICT_DEFRAGMENTED:
write_back_all(fd);
break;
case USER_DICT_LEMMA_DIRTY:
write_back_lemma(fd);
break;
case USER_DICT_OFFSET_DIRTY:
write_back_offset(fd);
break;
case USER_DICT_SCORE_DIRTY:
write_back_score(fd);
break;
#ifdef ___SYNC_ENABLED___
case USER_DICT_SYNC_DIRTY:
write_back_sync(fd);
break;
#endif
default:
break;
}
// Truncation does not seem to be needed on Linux or Windows,
// though it may be on Mac. Do it here anyway for safety.
off_t cur = lseek(fd, 0, SEEK_CUR);
ftruncate(fd, cur);
close(fd);
state_ = USER_DICT_SYNC;
}
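// write_back() picks the cheapest write based on how dirty the
// in-memory state is: score-only changes rewrite from the scores
// block onward, offset changes from the offsets block, new lemmas
// append to the lemma pool, and a defragmented dict rewrites
// everything. Each case also rewrites whatever follows its start
// point, matching the layout documented at get_dict_file_size().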
#ifdef ___SYNC_ENABLED___
void UserDict::write_back_sync(int fd) {
int err = lseek(fd, 4 + dict_info_.lemma_size
+ (dict_info_.lemma_count << 3)
#ifdef ___PREDICT_ENABLED___
+ (dict_info_.lemma_count << 2)
#endif
, SEEK_SET);
if (err == -1)
return;
write(fd, syncs_, dict_info_.sync_count << 2);
write(fd, &dict_info_, sizeof(dict_info_));
}
#endif
void UserDict::write_back_offset(int fd) {
int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
if (err == -1)
return;
write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
write(fd, syncs_, dict_info_.sync_count << 2);
#endif
write(fd, &dict_info_, sizeof(dict_info_));
}
void UserDict::write_back_score(int fd) {
int err = lseek(fd, 4 + dict_info_.lemma_size
+ (dict_info_.lemma_count << 2)
#ifdef ___PREDICT_ENABLED___
+ (dict_info_.lemma_count << 2)
#endif
, SEEK_SET);
if (err == -1)
return;
write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
write(fd, syncs_, dict_info_.sync_count << 2);
#endif
write(fd, &dict_info_, sizeof(dict_info_));
}
void UserDict::write_back_lemma(int fd) {
int err = lseek(fd, 4, SEEK_SET);
if (err == -1)
return;
// New lemmas are always appended, no need to write whole lemma block
size_t need_write = kUserDictPreAlloc *
(2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
if (err == -1)
return;
write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
write(fd, syncs_, dict_info_.sync_count << 2);
#endif
write(fd, &dict_info_, sizeof(dict_info_));
}
void UserDict::write_back_all(int fd) {
// XXX lemma_size is handled differently in writeall
// and writelemma. I update lemma_size and lemma_count in different
// places for these two cases. Should fix it to make it consistent.
int err = lseek(fd, 4, SEEK_SET);
if (err == -1)
return;
write(fd, lemmas_, dict_info_.lemma_size);
write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
write(fd, syncs_, dict_info_.sync_count << 2);
#endif
write(fd, &dict_info_, sizeof(dict_info_));
}
#ifdef ___CACHE_ENABLED___
bool UserDict::load_cache(UserDictSearchable *searchable,
uint32 *offset, uint32 *length) {
UserDictCache *cache = &caches_[searchable->splids_len - 1];
if (cache->head == cache->tail)
return false;
uint16 j, sig_len = kMaxLemmaSize / 4;
uint16 i = cache->head;
while (1) {
j = 0;
for (; j < sig_len; j++) {
if (cache->signatures[i][j] != searchable->signature[j])
break;
}
if (j < sig_len) {
i++;
if (i >= kUserDictCacheSize)
i -= kUserDictCacheSize;
if (i == cache->tail)
break;
continue;
}
*offset = cache->offsets[i];
*length = cache->lengths[i];
return true;
}
return false;
}
void UserDict::save_cache(UserDictSearchable *searchable,
uint32 offset, uint32 length) {
UserDictCache *cache = &caches_[searchable->splids_len - 1];
uint16 next = cache->tail;
cache->offsets[next] = offset;
cache->lengths[next] = length;
uint16 sig_len = kMaxLemmaSize / 4;
uint16 j = 0;
for (; j < sig_len; j++) {
cache->signatures[next][j] = searchable->signature[j];
}
if (++next >= kUserDictCacheSize) {
next -= kUserDictCacheSize;
}
if (next == cache->head) {
cache->head++;
if (cache->head >= kUserDictCacheSize) {
cache->head -= kUserDictCacheSize;
}
}
cache->tail = next;
}
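// The cache above is a small ring buffer per spelling-id length:
// head/tail index into fixed arrays of kUserDictCacheSize entries,
// and when tail catches up with head the oldest entry is evicted by
// advancing head. load_cache() walks from head toward tail, comparing
// signatures until it hits a match or reaches tail.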
void UserDict::reset_cache() {
memset(caches_, 0, sizeof(caches_));
}
bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
if (cache->head == cache->tail)
return false;
uint16 j, sig_len = kMaxLemmaSize / 4;
uint16 i = cache->head;
while (1) {
j = 0;
for (; j < sig_len; j++) {
if (cache->signatures[i][j] != searchable->signature[j])
break;
}
if (j < sig_len) {
i++;
if (i >= kUserDictMissCacheSize)
i -= kUserDictMissCacheSize;
if (i == cache->tail)
break;
continue;
}
return true;
}
return false;
}
void UserDict::save_miss_cache(UserDictSearchable *searchable) {
UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
uint16 next = cache->tail;
uint16 sig_len = kMaxLemmaSize / 4;
uint16 j = 0;
for (; j < sig_len; j++) {
cache->signatures[next][j] = searchable->signature[j];
}
if (++next >= kUserDictMissCacheSize) {
next -= kUserDictMissCacheSize;
}
if (next == cache->head) {
cache->head++;
if (cache->head >= kUserDictMissCacheSize) {
cache->head -= kUserDictMissCacheSize;
}
}
cache->tail = next;
}
void UserDict::reset_miss_cache() {
memset(miss_caches_, 0, sizeof(miss_caches_));
}
void UserDict::cache_init() {
reset_cache();
reset_miss_cache();
}
bool UserDict::cache_hit(UserDictSearchable *searchable,
uint32 *offset, uint32 *length) {
bool hit = load_miss_cache(searchable);
if (hit) {
*offset = 0;
*length = 0;
return true;
}
hit = load_cache(searchable, offset, length);
if (hit) {
return true;
}
return false;
}
void UserDict::cache_push(UserDictCacheType type,
UserDictSearchable *searchable,
uint32 offset, uint32 length) {
switch (type) {
case USER_DICT_MISS_CACHE:
save_miss_cache(searchable);
break;
case USER_DICT_CACHE:
save_cache(searchable, offset, length);
break;
default:
break;
}
}
#endif
void UserDict::defragment(void) {
#ifdef ___DEBUG_PERF___
DEBUG_PERF_BEGIN;
#endif
if (is_valid_state() == false)
return;
// Fixup offsets_, set REMOVE flag to lemma's flag if needed
size_t first_freed = 0;
size_t first_inuse = 0;
while (first_freed < dict_info_.lemma_count) {
// Find first freed offset
while (first_freed < dict_info_.lemma_count &&
(offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0) {
first_freed++;
}
if (first_freed < dict_info_.lemma_count) {
// Save REMOVE flag to lemma flag
int off = offsets_[first_freed];
set_lemma_flag(off, kUserDictLemmaFlagRemove);
} else {
break;
}
// Find first inuse offset after first_freed
first_inuse = first_freed + 1;
while (first_inuse < dict_info_.lemma_count &&
(offsets_[first_inuse] & kUserDictOffsetFlagRemove)) {
// Save REMOVE flag to lemma flag
int off = offsets_[first_inuse];
set_lemma_flag(off, kUserDictLemmaFlagRemove);
first_inuse++;
}
if (first_inuse >= dict_info_.lemma_count) {
break;
}
// Swap offsets_
int tmp = offsets_[first_inuse];
offsets_[first_inuse] = offsets_[first_freed];
offsets_[first_freed] = tmp;
// Swap scores_
tmp = scores_[first_inuse];
scores_[first_inuse] = scores_[first_freed];
scores_[first_freed] = tmp;
// Swap ids_
LemmaIdType tmpid = ids_[first_inuse];
ids_[first_inuse] = ids_[first_freed];
ids_[first_freed] = tmpid;
// Go on
first_freed++;
}
#ifdef ___PREDICT_ENABLED___
// Fixup predicts_
first_freed = 0;
first_inuse = 0;
while (first_freed < dict_info_.lemma_count) {
// Find first freed offset
while (first_freed < dict_info_.lemma_count &&
(predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0) {
first_freed++;
}
if (first_freed >= dict_info_.lemma_count)
break;
// Find first inuse offset after first_freed
first_inuse = first_freed + 1;
while (first_inuse < dict_info_.lemma_count &&
(predicts_[first_inuse] & kUserDictOffsetFlagRemove)) {
first_inuse++;
}
if (first_inuse >= dict_info_.lemma_count) {
break;
}
// Swap offsets_
int tmp = predicts_[first_inuse];
predicts_[first_inuse] = predicts_[first_freed];
predicts_[first_freed] = tmp;
// Go on
first_freed++;
}
#endif
dict_info_.lemma_count = first_freed;
// Fixup lemmas_
size_t begin = 0;
size_t end = 0;
size_t dst = 0;
int total_size = dict_info_.lemma_size + lemma_size_left_;
int total_count = dict_info_.lemma_count + lemma_count_left_;
size_t real_size = total_size - lemma_size_left_;
while (dst < real_size) {
unsigned char flag = get_lemma_flag(dst);
unsigned char nchr = get_lemma_nchar(dst);
if ((flag & kUserDictLemmaFlagRemove) == 0) {
dst += nchr * 4 + 2;
continue;
}
break;
}
if (dst >= real_size)
return;
end = dst;
while (end < real_size) {
begin = end + get_lemma_nchar(end) * 4 + 2;
repeat:
// Reached the end of the used lemma pool
if (begin >= real_size)
break;
unsigned char flag = get_lemma_flag(begin);
unsigned char nchr = get_lemma_nchar(begin);
if (flag & kUserDictLemmaFlagRemove) {
begin += nchr * 4 + 2;
goto repeat;
}
end = begin + nchr * 4 + 2;
while (end < real_size) {
unsigned char eflag = get_lemma_flag(end);
unsigned char enchr = get_lemma_nchar(end);
if ((eflag & kUserDictLemmaFlagRemove) == 0) {
end += enchr * 4 + 2;
continue;
}
break;
}
memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
for (size_t j = 0; j < dict_info_.lemma_count; j++) {
if (offsets_[j] >= begin && offsets_[j] < end) {
offsets_[j] -= (begin - dst);
offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
}
#ifdef ___PREDICT_ENABLED___
if (predicts_[j] >= begin && predicts_[j] < end) {
predicts_[j] -= (begin - dst);
}
#endif
}
#ifdef ___SYNC_ENABLED___
for (size_t j = 0; j < dict_info_.sync_count; j++) {
if (syncs_[j] >= begin && syncs_[j] < end) {
syncs_[j] -= (begin - dst);
}
}
#endif
dst += (end - begin);
}
dict_info_.free_count = 0;
dict_info_.free_size = 0;
dict_info_.lemma_size = dst;
lemma_size_left_ = total_size - dict_info_.lemma_size;
lemma_count_left_ = total_count - dict_info_.lemma_count;
// XXX Without the following code, offsets_by_id_ is not reordered;
// that is, removed lemmas' ids are never reclaimed. Because of
// offsets_by_id_, there may then be no room to add new lemmas even
// though lemma_size_left_ has been restored.
// By default we want defragmentation to be as fast as possible,
// because while it runs no other peer can write new lemmas to the
// user dictionary file.
// XXX If write-back is invoked immediately after this defragment,
// there is no need to fix up the following in-memory data.
for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
ids_[i] = start_id_ + i;
offsets_by_id_[i] = offsets_[i];
}
state_ = USER_DICT_DEFRAGMENTED;
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF("defragment");
#endif
}
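// Defragmentation summary: first compact offsets_ (with scores_ and
// ids_ in lockstep) and predicts_ by swapping removed slots toward the
// tail, then compact the lemma pool itself with memmove(), patching
// every table that stores byte offsets (offsets_, predicts_, syncs_,
// offsets_by_id_) as each live run moves down.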
#ifdef ___SYNC_ENABLED___
void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
if (is_valid_state() == false)
return;
if (end > dict_info_.sync_count)
end = dict_info_.sync_count;
memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
dict_info_.sync_count -= (end - start);
if (state_ < USER_DICT_SYNC_DIRTY)
state_ = USER_DICT_SYNC_DIRTY;
}
int UserDict::get_sync_count() {
if (is_valid_state() == false)
return 0;
return dict_info_.sync_count;
}
LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
uint16 lemma_len, uint16 count, uint64 lmt) {
int again = 0;
begin:
LemmaIdType id;
uint32 * syncs_bak = syncs_;
syncs_ = NULL;
id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
syncs_ = syncs_bak;
if (id == 0 && again == 0) {
if ((dict_info_.limit_lemma_count > 0 &&
dict_info_.lemma_count >= dict_info_.limit_lemma_count)
|| (dict_info_.limit_lemma_size > 0 &&
dict_info_.lemma_size + (2 + (lemma_len << 2))
> dict_info_.limit_lemma_size)) {
// XXX Always reclaim and defragment in the sync code path;
// the sync thread runs in the background and can afford heavy work
reclaim();
defragment();
flush_cache();
again = 1;
goto begin;
}
}
return id;
}
int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
int newly_added = 0;
SpellingParser * spl_parser = new SpellingParser();
if (!spl_parser) {
return 0;
}
#ifdef ___DEBUG_PERF___
DEBUG_PERF_BEGIN;
#endif
char16 *ptr = lemmas;
// Extract pinyin, words, frequency, last_mod_time
char16 * p = ptr, * py16 = ptr;
char16 * hz16 = NULL;
int py16_len = 0;
uint16 * splid = NULL;
int splid_size = 0;
int splid_len = 0;
int hz16_len = 0;
char16 * fr16 = NULL;
int fr16_len = 0;
while (p - ptr < len) {
// Pinyin
py16 = p;
splid_len = 0;
while ((p - ptr) < len && *p != 0x2c) {
if (*p == 0x20)
splid_len++;
p++;
}
splid_len++;
if (p - ptr == len)
break;
py16_len = p - py16;
if (splid_size < splid_len) {
// splid_size tracks capacity in uint16 elements, so keep it as a
// count rather than a byte size
uint16 * tid = (uint16*)realloc(splid, splid_len * sizeof(uint16));
if (!tid)
break;
splid = tid;
splid_size = splid_len;
}
bool is_pre;
int splidl = spl_parser->splstr16_to_idxs_f(
py16, py16_len, splid, NULL, splid_size, is_pre);
if (splidl != splid_len)
break;
// Phrase
hz16 = ++p;
while ((p - ptr) < len && *p != 0x2c) {
p++;
}
hz16_len = p - hz16;
if (hz16_len != splid_len)
break;
// Frequency
fr16 = ++p;
fr16_len = 0;
while ((p - ptr) < len && *p != 0x2c) {
p++;
}
fr16_len = p - fr16;
uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
// Last modified time
fr16 = ++p;
fr16_len = 0;
while ((p - ptr) < len && *p != 0x3b) {
p++;
}
fr16_len = p - fr16;
uint64 last_mod = utf16le_atoll(fr16, fr16_len);
put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
newly_added++;
p++;
}
if (splid)
free(splid);
delete spl_parser;
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
#endif
return newly_added;
}
int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
char16 * str, int size, int * count) {
int len = 0;
*count = 0;
int left_len = size;
if (is_valid_state() == false)
return len;
SpellingTrie * spl_trie = &SpellingTrie::get_instance();
if (!spl_trie) {
return 0;
}
uint32 i;
for (i = 0; i < dict_info_.sync_count; i++) {
int offset = syncs_[i];
uint32 nchar = get_lemma_nchar(offset);
uint16 *spl = get_lemma_spell_ids(offset);
uint16 *wrd = get_lemma_word(offset);
int score = _get_lemma_score(wrd, spl, nchar);
// temp holds char16 elements, so compute its capacity in elements,
// not bytes, for the pointer arithmetic below
static char16 temp[256], *ptemp = temp;
const int temp_cap = sizeof(temp) / sizeof(temp[0]);
ptemp = temp;
uint32 j;
// Add pinyin
for (j = 0; j < nchar; j++) {
int ret_len = spl_trie->get_spelling_str16(
spl[j], ptemp, temp + temp_cap - ptemp);
if (ret_len <= 0)
break;
ptemp += ret_len;
if (ptemp < temp + temp_cap - 1) {
*(ptemp++) = ' ';
} else {
j = 0;
break;
}
}
if (j < nchar) {
continue;
}
ptemp--;
if (ptemp < temp + temp_cap - 1) {
*(ptemp++) = ',';
} else {
continue;
}
// Add phrase
for (j = 0; j < nchar; j++) {
if (ptemp < temp + temp_cap - 1) {
*(ptemp++) = wrd[j];
} else {
break;
}
}
if (j < nchar) {
continue;
}
if (ptemp < temp + temp_cap - 1) {
*(ptemp++) = ',';
} else {
continue;
}
// Add frequency
uint32 intf = extract_score_freq(score);
int ret_len = utf16le_lltoa(intf, ptemp, temp + temp_cap - ptemp);
if (ret_len <= 0)
continue;
ptemp += ret_len;
if (ptemp < temp + temp_cap - 1) {
*(ptemp++) = ',';
} else {
continue;
}
// Add last modified time
uint64 last_mod = extract_score_lmt(score);
ret_len = utf16le_lltoa(last_mod, ptemp, temp + temp_cap - ptemp);
if (ret_len <= 0)
continue;
ptemp += ret_len;
if (ptemp < temp + temp_cap - 1) {
*(ptemp++) = ';';
} else {
continue;
}
// Write to string
int need_len = ptemp - temp;
if (need_len > left_len)
break;
memcpy(str + len, temp, need_len * 2);
left_len -= need_len;
len += need_len;
(*count)++;
}
if (len > 0) {
if (state_ < USER_DICT_SYNC_DIRTY)
state_ = USER_DICT_SYNC_DIRTY;
}
return len;
}
#endif
bool UserDict::state(UserDictStat * stat) {
if (is_valid_state() == false)
return false;
if (!stat)
return false;
stat->version = version_;
stat->file_name = dict_file_;
stat->load_time.tv_sec = load_time_.tv_sec;
stat->load_time.tv_usec = load_time_.tv_usec;
pthread_mutex_lock(&g_mutex_);
stat->last_update.tv_sec = g_last_update_.tv_sec;
stat->last_update.tv_usec = g_last_update_.tv_usec;
pthread_mutex_unlock(&g_mutex_);
stat->disk_size = get_dict_file_size(&dict_info_);
stat->lemma_count = dict_info_.lemma_count;
stat->lemma_size = dict_info_.lemma_size;
stat->delete_count = dict_info_.free_count;
stat->delete_size = dict_info_.free_size;
#ifdef ___SYNC_ENABLED___
stat->sync_count = dict_info_.sync_count;
#endif
stat->limit_lemma_count = dict_info_.limit_lemma_count;
stat->limit_lemma_size = dict_info_.limit_lemma_size;
stat->reclaim_ratio = dict_info_.reclaim_ratio;
return true;
}
void UserDict::set_limit(uint32 max_lemma_count,
uint32 max_lemma_size, uint32 reclaim_ratio) {
dict_info_.limit_lemma_count = max_lemma_count;
dict_info_.limit_lemma_size = max_lemma_size;
if (reclaim_ratio > 100)
reclaim_ratio = 100;
dict_info_.reclaim_ratio = reclaim_ratio;
}
void UserDict::reclaim() {
if (is_valid_state() == false)
return;
switch (dict_info_.reclaim_ratio) {
case 0:
return;
case 100:
// TODO: CLEAR to be implemented
assert(false);
return;
default:
break;
}
// XXX Reclaim is only based on count, not size
uint32 count = dict_info_.lemma_count;
int rc = count * dict_info_.reclaim_ratio / 100;
UserDictScoreOffsetPair * score_offset_pairs = NULL;
score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
sizeof(UserDictScoreOffsetPair) * rc);
if (score_offset_pairs == NULL) {
return;
}
for (int i = 0; i < rc; i++) {
int s = scores_[i];
score_offset_pairs[i].score = s;
score_offset_pairs[i].offset_index = i;
}
for (int i = (rc + 1) / 2; i >= 0; i--)
shift_down(score_offset_pairs, i, rc);
for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
int s = scores_[i];
if (s < score_offset_pairs[0].score) {
score_offset_pairs[0].score = s;
score_offset_pairs[0].offset_index = i;
shift_down(score_offset_pairs, 0, rc);
}
}
for (int i = 0; i < rc; i++) {
int off = score_offset_pairs[i].offset_index;
remove_lemma_by_offset_index(off);
}
if (rc > 0) {
if (state_ < USER_DICT_OFFSET_DIRTY)
state_ = USER_DICT_OFFSET_DIRTY;
}
free(score_offset_pairs);
}
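// reclaim() above selects the rc lowest-scoring lemmas with a
// max-heap: seed the heap with the first rc scores, then for each
// remaining score smaller than the heap root, replace the root and
// sift down. What remains in the heap afterwards are the rc smallest
// scores, i.e. the least valuable lemmas, which are then removed.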
inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
int s = sop[i].score;
int p = sop[i].offset_index;
sop[i].score = sop[j].score;
sop[i].offset_index = sop[j].offset_index;
sop[j].score = s;
sop[j].offset_index = p;
}
void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
int par = i;
while (par < n) {
int left = par * 2 + 1;
int right = left + 1;
if (left >= n && right >= n)
break;
if (right >= n) {
if (sop[left].score > sop[par].score) {
swap(sop, left, par);
par = left;
continue;
}
} else if (sop[left].score > sop[right].score &&
sop[left].score > sop[par].score) {
swap(sop, left, par);
par = left;
continue;
} else if (sop[right].score >= sop[left].score &&
sop[right].score > sop[par].score) {
swap(sop, right, par);
par = right;
continue;
}
break;
}
}
LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
uint16 lemma_len, uint16 count) {
return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
}
LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
uint16 lemma_len, uint16 count, uint64 lmt) {
#ifdef ___DEBUG_PERF___
DEBUG_PERF_BEGIN;
#endif
if (is_valid_state() == false)
return 0;
int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
if (off != -1) {
// Compare against the stored frequency, not the packed score
int delta_score = count - extract_score_freq(scores_[off]);
dict_info_.total_nfreq += delta_score;
scores_[off] = build_score(lmt, count);
if (state_ < USER_DICT_SCORE_DIRTY)
state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF("_put_lemma(update)");
#endif
return ids_[off];
} else {
if ((dict_info_.limit_lemma_count > 0 &&
dict_info_.lemma_count >= dict_info_.limit_lemma_count)
|| (dict_info_.limit_lemma_size > 0 &&
dict_info_.lemma_size + (2 + (lemma_len << 2))
> dict_info_.limit_lemma_size)) {
// XXX Don't defragment here, it's too time-consuming.
return 0;
}
int flushed = 0;
if (lemma_count_left_ == 0 ||
lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
// XXX When there is no space for the new lemma, we flush to disk.
// flush_cache() may also be called by the upper layer,
// and a better place than here should be found for it
flush_cache();
flushed = 1;
// Or simply return and do nothing
// return 0;
}
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
#endif
LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
#ifdef ___SYNC_ENABLED___
if (syncs_ && id != 0) {
queue_lemma_for_sync(id);
}
#endif
return id;
}
return 0;
}
#ifdef ___SYNC_ENABLED___
void UserDict::queue_lemma_for_sync(LemmaIdType id) {
if (dict_info_.sync_count < sync_count_size_) {
syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
} else {
uint32 * syncs = (uint32*)realloc(
syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
if (syncs) {
sync_count_size_ += kUserDictPreAlloc;
syncs_ = syncs;
syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
}
}
}
#endif
LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
bool selected) {
#ifdef ___DEBUG_PERF___
DEBUG_PERF_BEGIN;
#endif
if (is_valid_state() == false)
return 0;
if (is_valid_lemma_id(lemma_id) == false)
return 0;
uint32 offset = offsets_by_id_[lemma_id - start_id_];
uint8 lemma_len = get_lemma_nchar(offset);
char16 * lemma_str = get_lemma_word(offset);
uint16 * splids = get_lemma_spell_ids(offset);
int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
if (off != -1) {
int score = scores_[off];
int count = extract_score_freq(score);
uint64 lmt = extract_score_lmt(score);
if (count + delta_count > kUserDictMaxFrequency ||
count + delta_count < count) {
delta_count = kUserDictMaxFrequency - count;
}
count += delta_count;
dict_info_.total_nfreq += delta_count;
if (selected) {
lmt = time(NULL);
}
scores_[off] = build_score(lmt, count);
if (state_ < USER_DICT_SCORE_DIRTY)
state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
DEBUG_PERF_END;
LOGD_PERF("update_lemma");
#endif
#ifdef ___SYNC_ENABLED___
queue_lemma_for_sync(ids_[off]);
#endif
return ids_[off];
}
return 0;
}
size_t UserDict::get_total_lemma_count() {
return dict_info_.total_nfreq;
}
void UserDict::set_total_lemma_count_of_others(size_t count) {
total_other_nfreq_ = count;
}
LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
uint16 lemma_len, uint16 count, uint64 lmt) {
LemmaIdType id = get_max_lemma_id() + 1;
size_t offset = dict_info_.lemma_size;
if (offset > kUserDictOffsetMask)
return 0;
lemmas_[offset] = 0;
lemmas_[offset + 1] = (uint8)lemma_len;
for (size_t i = 0; i < lemma_len; i++) {
*((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
*((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
= lemma_str[i];
}
uint32 off = dict_info_.lemma_count;
offsets_[off] = offset;
scores_[off] = build_score(lmt, count);
ids_[off] = id;
#ifdef ___PREDICT_ENABLED___
predicts_[off] = offset;
#endif
offsets_by_id_[id - start_id_] = offset;
dict_info_.lemma_count++;
dict_info_.lemma_size += (2 + (lemma_len << 2));
lemma_count_left_--;
lemma_size_left_ -= (2 + (lemma_len << 2));
// Sort
UserDictSearchable searchable;
prepare_locate(&searchable, splids, lemma_len);
size_t i = 0;
while (i < off) {
offset = offsets_[i];
uint32 nchar = get_lemma_nchar(offset);
uint16 * spl = get_lemma_spell_ids(offset);
if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
break;
i++;
}
if (i != off) {
uint32 temp = offsets_[off];
memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
offsets_[i] = temp;
temp = scores_[off];
memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
scores_[i] = temp;
temp = ids_[off];
memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
ids_[i] = temp;
}
#ifdef ___PREDICT_ENABLED___
uint32 j = 0;
uint16 * words_new = get_lemma_word(predicts_[off]);
j = locate_where_to_insert_in_predicts(words_new, lemma_len);
if (j != off) {
uint32 temp = predicts_[off];
memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
predicts_[j] = temp;
}
#endif
if (state_ < USER_DICT_LEMMA_DIRTY)
state_ = USER_DICT_LEMMA_DIRTY;
#ifdef ___CACHE_ENABLED___
cache_init();
#endif
dict_info_.total_nfreq += count;
return id;
}
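// append_a_lemma() keeps offsets_/scores_/ids_ sorted by spelling id:
// the new record is written at the end of the lemma pool, then the
// linear scan above finds its sorted slot i and memmove() shifts the
// tail of each parallel array one entry right before dropping the new
// entry in. predicts_ is kept sorted by word (hanzi) the same way,
// via locate_where_to_insert_in_predicts().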
}