| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef PINYINIME_INCLUDE_USERDICT_H__ |
| #define PINYINIME_INCLUDE_USERDICT_H__ |
| |
| #define ___CACHE_ENABLED___ |
| #define ___SYNC_ENABLED___ |
| #define ___PREDICT_ENABLED___ |
| |
| // Debug performance for operations |
| // #define ___DEBUG_PERF___ |
| |
| #include <pthread.h> |
| #include "atomdictbase.h" |
| |
| namespace ime_pinyin { |
| |
| class UserDict : public AtomDictBase { |
| public: |
| UserDict(); |
| ~UserDict(); |
| |
| bool load_dict(const char *file_name, LemmaIdType start_id, |
| LemmaIdType end_id); |
| |
| bool close_dict(); |
| |
| size_t number_of_lemmas(); |
| |
| void reset_milestones(uint16 from_step, MileStoneHandle from_handle); |
| |
| MileStoneHandle extend_dict(MileStoneHandle from_handle, |
| const DictExtPara *dep, LmaPsbItem *lpi_items, |
| size_t lpi_max, size_t *lpi_num); |
| |
| size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, |
| LmaPsbItem *lpi_items, size_t lpi_max); |
| |
| uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, |
| uint16 str_max); |
| |
| uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, |
| uint16 splids_max, bool arg_valid); |
| |
| size_t predict(const char16 last_hzs[], uint16 hzs_len, |
| NPredictItem *npre_items, size_t npre_max, |
| size_t b4_used); |
| |
| // Full spelling ids are required |
| LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len, uint16 count); |
| |
| LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, |
| bool selected); |
| |
| LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len); |
| |
| LmaScoreType get_lemma_score(LemmaIdType lemma_id); |
| |
| LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len); |
| |
| bool remove_lemma(LemmaIdType lemma_id); |
| |
| size_t get_total_lemma_count(); |
| void set_total_lemma_count_of_others(size_t count); |
| |
| void flush_cache(); |
| |
| void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, |
| uint32 reclaim_ratio); |
| |
| void reclaim(); |
| |
| void defragment(); |
| |
| #ifdef ___SYNC_ENABLED___ |
| void clear_sync_lemmas(unsigned int start, unsigned int end); |
| |
| int get_sync_count(); |
| |
| LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len, uint16 count, uint64 lmt); |
| /** |
| * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. |
| * |
| * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' |
| * @param len length of lemmas string in UTF-16LE |
| * @return newly added lemma count |
| */ |
| int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); |
| |
| /** |
| * Get lemmas need sync to a UTF-16LE string of above format. |
| * Note: input buffer (str) must not be too small. If str is too small to |
| * contain single one lemma, there might be a dead loop. |
| * |
| * @param str buffer to write lemmas |
| * @param size buffer size in UTF-16LE |
| * @param count output value of lemma returned |
| * @return UTF-16LE string length |
| */ |
| int get_sync_lemmas_in_utf16le_string_from_beginning( |
| char16 * str, int size, int * count); |
| |
| #endif |
| |
| struct UserDictStat { |
| uint32 version; |
| const char * file_name; |
| struct timeval load_time; |
| struct timeval last_update; |
| uint32 disk_size; |
| uint32 lemma_count; |
| uint32 lemma_size; |
| uint32 delete_count; |
| uint32 delete_size; |
| #ifdef ___SYNC_ENABLED___ |
| uint32 sync_count; |
| #endif |
| uint32 reclaim_ratio; |
| uint32 limit_lemma_count; |
| uint32 limit_lemma_size; |
| }; |
| |
| bool state(UserDictStat * stat); |
| |
| private: |
| uint32 total_other_nfreq_; |
| struct timeval load_time_; |
| LemmaIdType start_id_; |
| uint32 version_; |
| uint8 * lemmas_; |
| |
| // In-Memory-Only flag for each lemma |
| static const uint8 kUserDictLemmaFlagRemove = 1; |
| // Inuse lemmas' offset |
| uint32 * offsets_; |
| // Highest bit in offset tells whether corresponding lemma is removed |
| static const uint32 kUserDictOffsetFlagRemove = (1 << 31); |
| // Maximum possible for the offset |
| static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); |
| // Bit width for last modified time, from 1 to 16 |
| static const uint32 kUserDictLMTBitWidth = 16; |
| // Granularity for last modified time in second |
| static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; |
| // Maximum frequency count |
| static const uint16 kUserDictMaxFrequency = 0xFFFF; |
| |
| #define COARSE_UTC(year, month, day, hour, minute, second) \ |
| ( \ |
| (year - 1970) * 365 * 24 * 60 * 60 + \ |
| (month - 1) * 30 * 24 * 60 * 60 + \ |
| (day - 1) * 24 * 60 * 60 + \ |
| (hour - 0) * 60 * 60 + \ |
| (minute - 0) * 60 + \ |
| (second - 0) \ |
| ) |
| static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); |
| |
| // Correspond to offsets_ |
| uint32 * scores_; |
| // Following two fields are only valid in memory |
| uint32 * ids_; |
| #ifdef ___PREDICT_ENABLED___ |
| uint32 * predicts_; |
| #endif |
| #ifdef ___SYNC_ENABLED___ |
| uint32 * syncs_; |
| size_t sync_count_size_; |
| #endif |
| uint32 * offsets_by_id_; |
| |
| size_t lemma_count_left_; |
| size_t lemma_size_left_; |
| |
| const char * dict_file_; |
| |
| // Be sure size is 4xN |
| struct UserDictInfo { |
| // When limitation reached, how much percentage will be reclaimed (1 ~ 100) |
| uint32 reclaim_ratio; |
| // maximum lemma count, 0 means no limitation |
| uint32 limit_lemma_count; |
| // Maximum lemma size, it's different from |
| // whole disk file size or in-mem dict size |
| // 0 means no limitation |
| uint32 limit_lemma_size; |
| // Total lemma count including deleted and inuse |
| // Also indicate offsets_ size |
| uint32 lemma_count; |
| // Total size of lemmas including used and freed |
| uint32 lemma_size; |
| // Freed lemma count |
| uint32 free_count; |
| // Freed lemma size in byte |
| uint32 free_size; |
| #ifdef ___SYNC_ENABLED___ |
| uint32 sync_count; |
| #endif |
| int32 total_nfreq; |
| } dict_info_; |
| |
| static const uint32 kUserDictVersion = 0x0ABCDEF0; |
| |
| static const uint32 kUserDictPreAlloc = 32; |
| static const uint32 kUserDictAverageNchar = 8; |
| |
| enum UserDictState { |
| // Keep in order |
| USER_DICT_NONE = 0, |
| USER_DICT_SYNC, |
| #ifdef ___SYNC_ENABLED___ |
| USER_DICT_SYNC_DIRTY, |
| #endif |
| USER_DICT_SCORE_DIRTY, |
| USER_DICT_OFFSET_DIRTY, |
| USER_DICT_LEMMA_DIRTY, |
| |
| USER_DICT_DEFRAGMENTED, |
| } state_; |
| |
| struct UserDictSearchable { |
| uint16 splids_len; |
| uint16 splid_start[kMaxLemmaSize]; |
| uint16 splid_count[kMaxLemmaSize]; |
| // Compact inital letters for both FuzzyCompareSpellId and cache system |
| uint32 signature[kMaxLemmaSize / 4]; |
| }; |
| |
| #ifdef ___CACHE_ENABLED___ |
| enum UserDictCacheType { |
| USER_DICT_CACHE, |
| USER_DICT_MISS_CACHE, |
| }; |
| |
| static const int kUserDictCacheSize = 4; |
| static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; |
| |
| struct UserDictMissCache { |
| uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; |
| uint16 head, tail; |
| } miss_caches_[kMaxLemmaSize]; |
| |
| struct UserDictCache { |
| uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; |
| uint32 offsets[kUserDictCacheSize]; |
| uint32 lengths[kUserDictCacheSize]; |
| // Ring buffer |
| uint16 head, tail; |
| } caches_[kMaxLemmaSize]; |
| |
| void cache_init(); |
| |
| void cache_push(UserDictCacheType type, |
| UserDictSearchable *searchable, |
| uint32 offset, uint32 length); |
| |
| bool cache_hit(UserDictSearchable *searchable, |
| uint32 *offset, uint32 *length); |
| |
| bool load_cache(UserDictSearchable *searchable, |
| uint32 *offset, uint32 *length); |
| |
| void save_cache(UserDictSearchable *searchable, |
| uint32 offset, uint32 length); |
| |
| void reset_cache(); |
| |
| bool load_miss_cache(UserDictSearchable *searchable); |
| |
| void save_miss_cache(UserDictSearchable *searchable); |
| |
| void reset_miss_cache(); |
| #endif |
| |
| LmaScoreType translate_score(int f); |
| |
| int extract_score_freq(int raw_score); |
| |
| uint64 extract_score_lmt(int raw_score); |
| |
| inline int build_score(uint64 lmt, int freq); |
| |
| inline int64 utf16le_atoll(uint16 *s, int len); |
| |
| inline int utf16le_lltoa(int64 v, uint16 *s, int size); |
| |
| LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len, uint16 count, uint64 lmt); |
| |
| size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, |
| LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); |
| |
| int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); |
| |
| int _get_lemma_score(LemmaIdType lemma_id); |
| |
| int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, |
| const UserDictSearchable *searchable); |
| |
| bool is_prefix_spell_id(const uint16 * fullids, |
| uint16 fulllen, const UserDictSearchable *searchable); |
| |
| uint32 get_dict_file_size(UserDictInfo * info); |
| |
| bool reset(const char *file); |
| |
| bool validate(const char *file); |
| |
| bool load(const char *file, LemmaIdType start_id); |
| |
| bool is_valid_state(); |
| |
| bool is_valid_lemma_id(LemmaIdType id); |
| |
| LemmaIdType get_max_lemma_id(); |
| |
| void set_lemma_flag(uint32 offset, uint8 flag); |
| |
| char get_lemma_flag(uint32 offset); |
| |
| char get_lemma_nchar(uint32 offset); |
| |
| uint16 * get_lemma_spell_ids(uint32 offset); |
| |
| uint16 * get_lemma_word(uint32 offset); |
| |
| // Prepare searchable to fasten locate process |
| void prepare_locate(UserDictSearchable *searchable, |
| const uint16 * splids, uint16 len); |
| |
| // Compare initial letters only |
| int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, |
| const UserDictSearchable *searchable); |
| |
| // Compare exactly two spell ids |
| // First argument must be a full id spell id |
| bool equal_spell_id(const uint16 * fullids, |
| uint16 fulllen, const UserDictSearchable *searchable); |
| |
| // Find first item by initial letters |
| int32 locate_first_in_offsets(const UserDictSearchable *searchable); |
| |
| LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], |
| uint16 lemma_len, uint16 count, uint64 lmt); |
| |
| // Check if a lemma is in dictionary |
| int32 locate_in_offsets(char16 lemma_str[], |
| uint16 splid_str[], uint16 lemma_len); |
| |
| bool remove_lemma_by_offset_index(int offset_index); |
| #ifdef ___PREDICT_ENABLED___ |
| uint32 locate_where_to_insert_in_predicts(const uint16 * words, |
| int lemma_len); |
| |
| int32 locate_first_in_predicts(const uint16 * words, int lemma_len); |
| |
| void remove_lemma_from_predict_list(uint32 offset); |
| #endif |
| #ifdef ___SYNC_ENABLED___ |
| void queue_lemma_for_sync(LemmaIdType id); |
| |
| void remove_lemma_from_sync_list(uint32 offset); |
| |
| void write_back_sync(int fd); |
| #endif |
| void write_back_score(int fd); |
| void write_back_offset(int fd); |
| void write_back_lemma(int fd); |
| void write_back_all(int fd); |
| void write_back(); |
| |
| struct UserDictScoreOffsetPair { |
| int score; |
| uint32 offset_index; |
| }; |
| |
| inline void swap(UserDictScoreOffsetPair * sop, int i, int j); |
| |
| void shift_down(UserDictScoreOffsetPair * sop, int i, int n); |
| |
| // On-disk format for each lemma |
| // +-------------+ |
| // | Version (4) | |
| // +-------------+ |
| // +-----------+-----------+--------------------+-------------------+ |
| // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | |
| // +-----------+-----------+--------------------+-------------------+ |
| // ... |
| // +-----------------------+ +-------------+ <---Offset of offset |
| // | Offset1 by_splids (4) | ... | OffsetN (4) | |
| // +-----------------------+ +-------------+ |
| #ifdef ___PREDICT_ENABLED___ |
| // +----------------------+ +-------------+ |
| // | Offset1 by_lemma (4) | ... | OffsetN (4) | |
| // +----------------------+ +-------------+ |
| #endif |
| // +------------+ +------------+ |
| // | Score1 (4) | ... | ScoreN (4) | |
| // +------------+ +------------+ |
| #ifdef ___SYNC_ENABLED___ |
| // +-------------+ +-------------+ |
| // | NewAdd1 (4) | ... | NewAddN (4) | |
| // +-------------+ +-------------+ |
| #endif |
| // +----------------+ |
| // | Dict Info (4x) | |
| // +----------------+ |
| }; |
| } |
| |
| #endif |