diff --git a/command/pinyinime_dictbuilder.cpp b/command/pinyinime_dictbuilder.cpp index d5e0433..5c6786f 100644 --- a/command/pinyinime_dictbuilder.cpp +++ b/command/pinyinime_dictbuilder.cpp @@ -1,55 +1,59 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include "../src/include/dicttrie.h" - -using namespace ime_pinyin; - -/** - * Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined - * in dictdef.h. - */ -int main(int argc, char* argv[]) { - DictTrie* dict_trie = new DictTrie(); - bool success; - if (argc >= 3) - success = dict_trie->build_dict(argv[1], argv[2]); - else - success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt", "../data/valid_utf16.txt"); - - if (success) { - printf("Build dictionary successfully.\n"); - } else { - printf("Build dictionary unsuccessfully.\n"); - return -1; - } - - success = dict_trie->save_dict("./dict/dict_pinyin.dat"); - - if (success) { - printf("Save dictionary successfully.\n"); - } else { - printf("Save dictionary unsuccessfully.\n"); - return -1; - } - - return 0; +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif +#include "../src/include/dicttrie.h" + +using namespace ime_pinyin; + +/** + * Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined + * in dictdef.h. + */ +int main(int argc, char* argv[]) { + DictTrie* dict_trie = new DictTrie(); + bool success; + if (argc >= 3) + success = dict_trie->build_dict(argv[1], argv[2]); + else + success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt", "../data/valid_utf16.txt"); + + if (success) { + printf("Build dictionary successfully.\n"); + } else { + printf("Build dictionary unsuccessfully.\n"); + return -1; + } + + success = dict_trie->save_dict("./dict/dict_pinyin.dat"); + + if (success) { + printf("Save dictionary successfully.\n"); + } else { + printf("Save dictionary unsuccessfully.\n"); + return -1; + } + + return 0; } \ No newline at end of file diff --git a/command/lcompile.sh b/command/scripts/lcompile.sh old mode 100755 new mode 100644 similarity index 95% rename from command/lcompile.sh rename to command/scripts/lcompile.sh index bb146aa..b530bcd --- a/command/lcompile.sh +++ b/command/scripts/lcompile.sh @@ -1,23 +1,23 @@ -#!/bin/bash -currentDirectory=$(pwd) -cmakeListsPath="${currentDirectory}/CMakeLists.txt" - -if [ ! -f "$cmakeListsPath" ]; then - echo "No CMakeLists.txt in current directory, please check." - exit 1 -fi - -echo "Start generating and compiling..." - -buildFolderPath="./build" - -if [ ! -d "$buildFolderPath" ]; then - mkdir -p "$buildFolderPath" - echo "build folder created." -fi - -cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ - -if [ $? -eq 0 ]; then - cmake --build ./build/ --config DEBUG -fi +#!/bin/bash +currentDirectory=$(pwd) +cmakeListsPath="${currentDirectory}/CMakeLists.txt" + +if [ ! -f "$cmakeListsPath" ]; then + echo "No CMakeLists.txt in current directory, please check." + exit 1 +fi + +echo "Start generating and compiling..." + +buildFolderPath="./build" + +if [ ! -d "$buildFolderPath" ]; then + mkdir -p "$buildFolderPath" + echo "build folder created." +fi + +cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ + +if [ $? -eq 0 ]; then + cmake --build ./build/ --config DEBUG +fi diff --git a/llaunch.sh b/command/scripts/llaunch.sh old mode 100755 new mode 100644 similarity index 96% rename from llaunch.sh rename to command/scripts/llaunch.sh index 4518c0b..49e549a --- a/llaunch.sh +++ b/command/scripts/llaunch.sh @@ -1,45 +1,45 @@ -#!/bin/bash -currentDirectory=$(pwd) -cmakeListsPath="${currentDirectory}/CMakeLists.txt" - -if [ ! -f "$cmakeListsPath" ]; then - echo "No CMakeLists.txt in current directory, please check." - exit 1 -fi - -echo "Start generating and compiling..." - -buildFolderPath="./build" - -if [ ! -d "$buildFolderPath" ]; then - mkdir -p "$buildFolderPath" - echo "build folder created." -fi - -cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ - -if [ $? -eq 0 ]; then - cmake --build ./build/ --config DEBUG - if [ $? -eq 0 ]; then - content=$(<"./CMakeLists.txt") - exePath="" - while IFS= read -r line; do - if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then - pattern="\"([^\"]+)\"" - if [[ $line =~ $pattern ]]; then - contentInParentheses="${BASH_REMATCH[1]}" - result=($contentInParentheses) - exePath="./build/bin/${result[0]}" - echo "start running as follows..." - echo "==================================================" - fi - fi - done <<<"$content" - # execute the binary file - if [ -n "$exePath" ]; then - $exePath - else - echo "cannot find executable file path" - fi - fi -fi +#!/bin/bash +currentDirectory=$(pwd) +cmakeListsPath="${currentDirectory}/CMakeLists.txt" + +if [ ! -f "$cmakeListsPath" ]; then + echo "No CMakeLists.txt in current directory, please check." + exit 1 +fi + +echo "Start generating and compiling..." + +buildFolderPath="./build" + +if [ ! -d "$buildFolderPath" ]; then + mkdir -p "$buildFolderPath" + echo "build folder created." +fi + +cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ + +if [ $? -eq 0 ]; then + cmake --build ./build/ --config DEBUG + if [ $? -eq 0 ]; then + content=$(<"./CMakeLists.txt") + exePath="" + while IFS= read -r line; do + if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then + pattern="\"([^\"]+)\"" + if [[ $line =~ $pattern ]]; then + contentInParentheses="${BASH_REMATCH[1]}" + result=($contentInParentheses) + exePath="./build/bin/${result[0]}" + echo "start running as follows..." + echo "==================================================" + fi + fi + done <<<"$content" + # execute the binary file + if [ -n "$exePath" ]; then + $exePath + else + echo "cannot find executable file path" + fi + fi +fi diff --git a/lrun.sh b/command/scripts/lrun.sh old mode 100755 new mode 100644 similarity index 96% rename from lrun.sh rename to command/scripts/lrun.sh index 0c6264e..8dec0c3 --- a/lrun.sh +++ b/command/scripts/lrun.sh @@ -1,18 +1,18 @@ -content=$(<"./CMakeLists.txt") -exePath="" -while IFS= read -r line; do - if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then - pattern="\"([^\"]+)\"" - if [[ $line =~ $pattern ]]; then - contentInParentheses="${BASH_REMATCH[1]}" - result=($contentInParentheses) - exePath="./build/bin/${result[0]}" - fi - fi -done <<<"$content" - -if [ -n "$exePath" ]; then - $exePath -else - echo "cannot find executable file path" -fi +content=$(<"./CMakeLists.txt") +exePath="" +while IFS= read -r line; do + if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then + pattern="\"([^\"]+)\"" + if [[ $line =~ $pattern ]]; then + contentInParentheses="${BASH_REMATCH[1]}" + result=($contentInParentheses) + exePath="./build/bin/${result[0]}" + fi + fi +done <<<"$content" + +if [ -n "$exePath" ]; then + $exePath +else + echo "cannot find executable file path" +fi diff --git a/scripts/lcompile.ps1 b/scripts/lcompile.ps1 new file mode 100644 index 0000000..81d154f --- /dev/null +++ b/scripts/lcompile.ps1 @@ -0,0 +1,26 @@ +# generate compile to exe files +$currentDirectory = Get-Location +$cmakeListsPath = Join-Path -Path $currentDirectory -ChildPath "CMakeLists.txt" + +if (-not (Test-Path $cmakeListsPath)) +{ + Write-Host("No CMakeLists.txt in current directory, please check.") + return +} + +Write-Host "Start generating and compiling..." + +$buildFolderPath = ".\build" + +if (-not (Test-Path $buildFolderPath)) +{ + New-Item -ItemType Directory -Path $buildFolderPath | Out-Null + Write-Host "build folder created." +} + +cmake -G "Visual Studio 17 2022" -A x64 -S . -B ./build/ + +if ($LASTEXITCODE -eq 0) +{ + cmake --build ./build/ --config DEBUG +} diff --git a/lcompile.sh b/scripts/lcompile.sh old mode 100755 new mode 100644 similarity index 95% rename from lcompile.sh rename to scripts/lcompile.sh index bb146aa..b530bcd --- a/lcompile.sh +++ b/scripts/lcompile.sh @@ -1,23 +1,23 @@ -#!/bin/bash -currentDirectory=$(pwd) -cmakeListsPath="${currentDirectory}/CMakeLists.txt" - -if [ ! -f "$cmakeListsPath" ]; then - echo "No CMakeLists.txt in current directory, please check." - exit 1 -fi - -echo "Start generating and compiling..." - -buildFolderPath="./build" - -if [ ! -d "$buildFolderPath" ]; then - mkdir -p "$buildFolderPath" - echo "build folder created." -fi - -cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ - -if [ $? -eq 0 ]; then - cmake --build ./build/ --config DEBUG -fi +#!/bin/bash +currentDirectory=$(pwd) +cmakeListsPath="${currentDirectory}/CMakeLists.txt" + +if [ ! -f "$cmakeListsPath" ]; then + echo "No CMakeLists.txt in current directory, please check." + exit 1 +fi + +echo "Start generating and compiling..." + +buildFolderPath="./build" + +if [ ! -d "$buildFolderPath" ]; then + mkdir -p "$buildFolderPath" + echo "build folder created." +fi + +cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ + +if [ $? -eq 0 ]; then + cmake --build ./build/ --config DEBUG +fi diff --git a/scripts/llaunch.ps1 b/scripts/llaunch.ps1 new file mode 100644 index 0000000..4c23a29 --- /dev/null +++ b/scripts/llaunch.ps1 @@ -0,0 +1,45 @@ +# +# generate, compile and run exe files +# +function getExePathFromCMakeLists() { + $content = Get-Content -Raw -Path "./CMakeLists.txt" + $exePath = "" + foreach ($line in $content -split "`n") { + if ($line -match 'set\(MY_EXECUTABLE_NAME[^\"]*\"([^\"]+)\"') { + $exeName = $matches[1] + $exePath = "./build/bin/Debug/$exeName" + ".exe" + break + } + } + return $exePath +} + +$currentDirectory = Get-Location +$cmakeListsPath = Join-Path -Path $currentDirectory -ChildPath "CMakeLists.txt" + +if (-not (Test-Path $cmakeListsPath)) { + Write-Host("No CMakeLists.txt in current directory, please check.") + return +} + +Write-Host "Start generating and compiling..." + +$buildFolderPath = ".\build" + +if (-not (Test-Path $buildFolderPath)) { + New-Item -ItemType Directory -Path $buildFolderPath | Out-Null + Write-Host "build folder created." +} + +cmake -G "Visual Studio 17 2022" -A x64 -S . -B ./build/ + +if ($LASTEXITCODE -eq 0) { + cmake --build ./build/ --config DEBUG + if ($LASTEXITCODE -eq 0) { + $exePath = getExePathFromCMakeLists + Write-Host "start running as follows..." + Write-Host "==================================================" + Invoke-Expression $exePath + } +} + diff --git a/command/llaunch.sh b/scripts/llaunch.sh old mode 100755 new mode 100644 similarity index 96% rename from command/llaunch.sh rename to scripts/llaunch.sh index 4518c0b..49e549a --- a/command/llaunch.sh +++ b/scripts/llaunch.sh @@ -1,45 +1,45 @@ -#!/bin/bash -currentDirectory=$(pwd) -cmakeListsPath="${currentDirectory}/CMakeLists.txt" - -if [ ! -f "$cmakeListsPath" ]; then - echo "No CMakeLists.txt in current directory, please check." - exit 1 -fi - -echo "Start generating and compiling..." - -buildFolderPath="./build" - -if [ ! -d "$buildFolderPath" ]; then - mkdir -p "$buildFolderPath" - echo "build folder created." -fi - -cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ - -if [ $? -eq 0 ]; then - cmake --build ./build/ --config DEBUG - if [ $? -eq 0 ]; then - content=$(<"./CMakeLists.txt") - exePath="" - while IFS= read -r line; do - if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then - pattern="\"([^\"]+)\"" - if [[ $line =~ $pattern ]]; then - contentInParentheses="${BASH_REMATCH[1]}" - result=($contentInParentheses) - exePath="./build/bin/${result[0]}" - echo "start running as follows..." - echo "==================================================" - fi - fi - done <<<"$content" - # execute the binary file - if [ -n "$exePath" ]; then - $exePath - else - echo "cannot find executable file path" - fi - fi -fi +#!/bin/bash +currentDirectory=$(pwd) +cmakeListsPath="${currentDirectory}/CMakeLists.txt" + +if [ ! -f "$cmakeListsPath" ]; then + echo "No CMakeLists.txt in current directory, please check." + exit 1 +fi + +echo "Start generating and compiling..." + +buildFolderPath="./build" + +if [ ! -d "$buildFolderPath" ]; then + mkdir -p "$buildFolderPath" + echo "build folder created." +fi + +cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B ./build/ + +if [ $? -eq 0 ]; then + cmake --build ./build/ --config DEBUG + if [ $? -eq 0 ]; then + content=$(<"./CMakeLists.txt") + exePath="" + while IFS= read -r line; do + if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then + pattern="\"([^\"]+)\"" + if [[ $line =~ $pattern ]]; then + contentInParentheses="${BASH_REMATCH[1]}" + result=($contentInParentheses) + exePath="./build/bin/${result[0]}" + echo "start running as follows..." + echo "==================================================" + fi + fi + done <<<"$content" + # execute the binary file + if [ -n "$exePath" ]; then + $exePath + else + echo "cannot find executable file path" + fi + fi +fi diff --git a/scripts/lrun.ps1 b/scripts/lrun.ps1 new file mode 100644 index 0000000..1faa0ff --- /dev/null +++ b/scripts/lrun.ps1 @@ -0,0 +1,20 @@ +# +# run exe file that has already been compiled before +# +function getExePathFromCMakeLists() { + $content = Get-Content -Raw -Path "./CMakeLists.txt" + $exePath = "" + foreach ($line in $content -split "`n") { + if ($line -match 'set\(MY_EXECUTABLE_NAME[^\"]*\"([^\"]+)\"') { + $exeName = $matches[1] + $exePath = "./build/bin/Debug/$exeName" + ".exe" + break + } + } + return $exePath +} + +$exePath = getExePathFromCMakeLists +#Write-Host "start running as follows..." +#Write-Host "==================================================" +Invoke-Expression $exePath diff --git a/command/lrun.sh b/scripts/lrun.sh old mode 100755 new mode 100644 similarity index 96% rename from command/lrun.sh rename to scripts/lrun.sh index 0c6264e..8dec0c3 --- a/command/lrun.sh +++ b/scripts/lrun.sh @@ -1,18 +1,18 @@ -content=$(<"./CMakeLists.txt") -exePath="" -while IFS= read -r line; do - if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then - pattern="\"([^\"]+)\"" - if [[ $line =~ $pattern ]]; then - contentInParentheses="${BASH_REMATCH[1]}" - result=($contentInParentheses) - exePath="./build/bin/${result[0]}" - fi - fi -done <<<"$content" - -if [ -n "$exePath" ]; then - $exePath -else - echo "cannot find executable file path" -fi +content=$(<"./CMakeLists.txt") +exePath="" +while IFS= read -r line; do + if [[ $line == "set(MY_EXECUTABLE_NAME"* ]]; then + pattern="\"([^\"]+)\"" + if [[ $line =~ $pattern ]]; then + contentInParentheses="${BASH_REMATCH[1]}" + result=($contentInParentheses) + exePath="./build/bin/${result[0]}" + fi + fi +done <<<"$content" + +if [ -n "$exePath" ]; then + $exePath +else + echo "cannot find executable file path" +fi diff --git a/src/include/userdict.h b/src/include/userdict.h index 32914fe..766e5e8 100644 --- a/src/include/userdict.h +++ b/src/include/userdict.h @@ -1,390 +1,396 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PINYINIME_INCLUDE_USERDICT_H__ -#define PINYINIME_INCLUDE_USERDICT_H__ - -#define ___CACHE_ENABLED___ -#define ___SYNC_ENABLED___ -#define ___PREDICT_ENABLED___ - -// Debug performance for operations -// #define ___DEBUG_PERF___ - -#include -#include "atomdictbase.h" - -namespace ime_pinyin { - -class UserDict : public AtomDictBase { - public: - UserDict(); - ~UserDict(); - - bool load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id); - - bool close_dict(); - - size_t number_of_lemmas(); - - void reset_milestones(uint16 from_step, MileStoneHandle from_handle); - - MileStoneHandle extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); - - size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max); - - uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); - - uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid); - - size_t predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used); - - // Full spelling ids are required - LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count); - - LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected); - - LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len); - - LmaScoreType get_lemma_score(LemmaIdType lemma_id); - - LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); - - bool remove_lemma(LemmaIdType lemma_id); - - size_t get_total_lemma_count(); - void set_total_lemma_count_of_others(size_t count); - - void flush_cache(); - - void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio); - - void reclaim(); - - void defragment(); - -#ifdef ___SYNC_ENABLED___ - void clear_sync_lemmas(unsigned int start, unsigned int end); - - int get_sync_count(); - - LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); - /** - * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. - * - * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' - * @param len length of lemmas string in UTF-16LE - * @return newly added lemma count - */ - int put_lemmas_no_sync_from_utf16le_string(char16 *lemmas, int len); - - /** - * Get lemmas need sync to a UTF-16LE string of above format. - * Note: input buffer (str) must not be too small. If str is too small to - * contain single one lemma, there might be a dead loop. - * - * @param str buffer to write lemmas - * @param size buffer size in UTF-16LE - * @param count output value of lemma returned - * @return UTF-16LE string length - */ - int get_sync_lemmas_in_utf16le_string_from_beginning(char16 *str, int size, int *count); - -#endif - - struct UserDictStat { - uint32 version; - const char *file_name; - struct timeval load_time; - struct timeval last_update; - uint32 disk_size; - uint32 lemma_count; - uint32 lemma_size; - uint32 delete_count; - uint32 delete_size; -#ifdef ___SYNC_ENABLED___ - uint32 sync_count; -#endif - uint32 reclaim_ratio; - uint32 limit_lemma_count; - uint32 limit_lemma_size; - }; - - bool state(UserDictStat *stat); - - private: - uint32 total_other_nfreq_; - struct timeval load_time_; - LemmaIdType start_id_; - uint32 version_; - uint8 *lemmas_; - - // In-Memory-Only flag for each lemma - static const uint8 kUserDictLemmaFlagRemove = 1; - // Inuse lemmas' offset - uint32 *offsets_; - // Highest bit in offset tells whether corresponding lemma is removed - static const uint32 kUserDictOffsetFlagRemove = (1 << 31); - // Maximum possible for the offset - static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); - // Bit width for last modified time, from 1 to 16 - static const uint32 kUserDictLMTBitWidth = 16; - // Granularity for last modified time in second - static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; - // Maximum frequency count - static const uint16 kUserDictMaxFrequency = 0xFFFF; - -#define COARSE_UTC(year, month, day, hour, minute, second) ((year - 1970) * 365 * 24 * 60 * 60 + (month - 1) * 30 * 24 * 60 * 60 + (day - 1) * 24 * 60 * 60 + (hour - 0) * 60 * 60 + (minute - 0) * 60 + (second - 0)) - static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); - - // Correspond to offsets_ - uint32 *scores_; - // Following two fields are only valid in memory - uint32 *ids_; -#ifdef ___PREDICT_ENABLED___ - uint32 *predicts_; -#endif -#ifdef ___SYNC_ENABLED___ - uint32 *syncs_; - size_t sync_count_size_; -#endif - uint32 *offsets_by_id_; - - size_t lemma_count_left_; - size_t lemma_size_left_; - - const char *dict_file_; - - // Be sure size is 4xN - struct UserDictInfo { - // When limitation reached, how much percentage will be reclaimed (1 ~ 100) - uint32 reclaim_ratio; - // maximum lemma count, 0 means no limitation - uint32 limit_lemma_count; - // Maximum lemma size, it's different from - // whole disk file size or in-mem dict size - // 0 means no limitation - uint32 limit_lemma_size; - // Total lemma count including deleted and inuse - // Also indicate offsets_ size - uint32 lemma_count; - // Total size of lemmas including used and freed - uint32 lemma_size; - // Freed lemma count - uint32 free_count; - // Freed lemma size in byte - uint32 free_size; -#ifdef ___SYNC_ENABLED___ - uint32 sync_count; -#endif - int32 total_nfreq; - } dict_info_; - - static const uint32 kUserDictVersion = 0x0ABCDEF0; - - static const uint32 kUserDictPreAlloc = 32; - static const uint32 kUserDictAverageNchar = 8; - - enum UserDictState { - // Keep in order - USER_DICT_NONE = 0, - USER_DICT_SYNC, -#ifdef ___SYNC_ENABLED___ - USER_DICT_SYNC_DIRTY, -#endif - USER_DICT_SCORE_DIRTY, - USER_DICT_OFFSET_DIRTY, - USER_DICT_LEMMA_DIRTY, - - USER_DICT_DEFRAGMENTED, - } state_; - - struct UserDictSearchable { - uint16 splids_len; - uint16 splid_start[kMaxLemmaSize]; - uint16 splid_count[kMaxLemmaSize]; - // Compact inital letters for both FuzzyCompareSpellId and cache system - uint32 signature[kMaxLemmaSize / 4]; - }; - -#ifdef ___CACHE_ENABLED___ - enum UserDictCacheType { - USER_DICT_CACHE, - USER_DICT_MISS_CACHE, - }; - - static const int kUserDictCacheSize = 4; - static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; - - struct UserDictMissCache { - uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; - uint16 head, tail; - } miss_caches_[kMaxLemmaSize]; - - struct UserDictCache { - uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; - uint32 offsets[kUserDictCacheSize]; - uint32 lengths[kUserDictCacheSize]; - // Ring buffer - uint16 head, tail; - } caches_[kMaxLemmaSize]; - - void cache_init(); - - void cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length); - - bool cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length); - - bool load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length); - - void save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length); - - void reset_cache(); - - bool load_miss_cache(UserDictSearchable *searchable); - - void save_miss_cache(UserDictSearchable *searchable); - - void reset_miss_cache(); -#endif - - LmaScoreType translate_score(int f); - - int extract_score_freq(int raw_score); - - uint64 extract_score_lmt(int raw_score); - - inline int build_score(uint64 lmt, int freq); - - inline int64 utf16le_atoll(uint16 *s, int len); - - inline int utf16le_lltoa(int64 v, uint16 *s, int size); - - LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); - - size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool *need_extend); - - int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); - - int _get_lemma_score(LemmaIdType lemma_id); - - int is_fuzzy_prefix_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable); - - bool is_prefix_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable); - - uint32 get_dict_file_size(UserDictInfo *info); - - bool reset(const char *file); - - bool validate(const char *file); - - bool load(const char *file, LemmaIdType start_id); - - bool is_valid_state(); - - bool is_valid_lemma_id(LemmaIdType id); - - LemmaIdType get_max_lemma_id(); - - void set_lemma_flag(uint32 offset, uint8 flag); - - char get_lemma_flag(uint32 offset); - - char get_lemma_nchar(uint32 offset); - - uint16 *get_lemma_spell_ids(uint32 offset); - - uint16 *get_lemma_word(uint32 offset); - - // Prepare searchable to fasten locate process - void prepare_locate(UserDictSearchable *searchable, const uint16 *splids, uint16 len); - - // Compare initial letters only - int32 fuzzy_compare_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable); - - // Compare exactly two spell ids - // First argument must be a full id spell id - bool equal_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable); - - // Find first item by initial letters - int32 locate_first_in_offsets(const UserDictSearchable *searchable); - - LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); - - // Check if a lemma is in dictionary - int32 locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len); - - bool remove_lemma_by_offset_index(int offset_index); -#ifdef ___PREDICT_ENABLED___ - uint32 locate_where_to_insert_in_predicts(const uint16 *words, int lemma_len); - - int32 locate_first_in_predicts(const uint16 *words, int lemma_len); - - void remove_lemma_from_predict_list(uint32 offset); -#endif -#ifdef ___SYNC_ENABLED___ - void queue_lemma_for_sync(LemmaIdType id); - - void remove_lemma_from_sync_list(uint32 offset); - - void write_back_sync(int fd); -#endif - void write_back_score(int fd); - void write_back_offset(int fd); - void write_back_lemma(int fd); - void write_back_all(int fd); - void write_back(); - - struct UserDictScoreOffsetPair { - int score; - uint32 offset_index; - }; - - inline void swap(UserDictScoreOffsetPair *sop, int i, int j); - - void shift_down(UserDictScoreOffsetPair *sop, int i, int n); - - // On-disk format for each lemma - // +-------------+ - // | Version (4) | - // +-------------+ - // +-----------+-----------+--------------------+-------------------+ - // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | - // +-----------+-----------+--------------------+-------------------+ - // ... - // +-----------------------+ +-------------+ <---Offset of offset - // | Offset1 by_splids (4) | ... | OffsetN (4) | - // +-----------------------+ +-------------+ -#ifdef ___PREDICT_ENABLED___ - // +----------------------+ +-------------+ - // | Offset1 by_lemma (4) | ... | OffsetN (4) | - // +----------------------+ +-------------+ -#endif - // +------------+ +------------+ - // | Score1 (4) | ... | ScoreN (4) | - // +------------+ +------------+ -#ifdef ___SYNC_ENABLED___ - // +-------------+ +-------------+ - // | NewAdd1 (4) | ... | NewAddN (4) | - // +-------------+ +-------------+ -#endif - // +----------------+ - // | Dict Info (4x) | - // +----------------+ -}; -} // namespace ime_pinyin - -#endif +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_USERDICT_H__ +#define PINYINIME_INCLUDE_USERDICT_H__ + +#define ___CACHE_ENABLED___ +#define ___SYNC_ENABLED___ +#define ___PREDICT_ENABLED___ + +// Debug performance for operations +// #define ___DEBUG_PERF___ + +#ifdef _WIN32 +#include +#include // timeval +#else +#include +#include +#endif +#include "atomdictbase.h" + +namespace ime_pinyin { + +class UserDict : public AtomDictBase { + public: + UserDict(); + ~UserDict(); + + bool load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id); + + bool close_dict(); + + size_t number_of_lemmas(); + + void reset_milestones(uint16 from_step, MileStoneHandle from_handle); + + MileStoneHandle extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); + + size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max); + + uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); + + uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid); + + size_t predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used); + + // Full spelling ids are required + LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count); + + LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected); + + LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len); + + LmaScoreType get_lemma_score(LemmaIdType lemma_id); + + LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); + + bool remove_lemma(LemmaIdType lemma_id); + + size_t get_total_lemma_count(); + void set_total_lemma_count_of_others(size_t count); + + void flush_cache(); + + void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio); + + void reclaim(); + + void defragment(); + +#ifdef ___SYNC_ENABLED___ + void clear_sync_lemmas(unsigned int start, unsigned int end); + + int get_sync_count(); + + LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); + /** + * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. + * + * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' + * @param len length of lemmas string in UTF-16LE + * @return newly added lemma count + */ + int put_lemmas_no_sync_from_utf16le_string(char16 *lemmas, int len); + + /** + * Get lemmas need sync to a UTF-16LE string of above format. + * Note: input buffer (str) must not be too small. If str is too small to + * contain single one lemma, there might be a dead loop. + * + * @param str buffer to write lemmas + * @param size buffer size in UTF-16LE + * @param count output value of lemma returned + * @return UTF-16LE string length + */ + int get_sync_lemmas_in_utf16le_string_from_beginning(char16 *str, int size, int *count); + +#endif + + struct UserDictStat { + uint32 version; + const char *file_name; + struct timeval load_time; + struct timeval last_update; + uint32 disk_size; + uint32 lemma_count; + uint32 lemma_size; + uint32 delete_count; + uint32 delete_size; +#ifdef ___SYNC_ENABLED___ + uint32 sync_count; +#endif + uint32 reclaim_ratio; + uint32 limit_lemma_count; + uint32 limit_lemma_size; + }; + + bool state(UserDictStat *stat); + + private: + uint32 total_other_nfreq_; + struct timeval load_time_; + LemmaIdType start_id_; + uint32 version_; + uint8 *lemmas_; + + // In-Memory-Only flag for each lemma + static const uint8 kUserDictLemmaFlagRemove = 1; + // Inuse lemmas' offset + uint32 *offsets_; + // Highest bit in offset tells whether corresponding lemma is removed + static const uint32 kUserDictOffsetFlagRemove = (1 << 31); + // Maximum possible for the offset + static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); + // Bit width for last modified time, from 1 to 16 + static const uint32 kUserDictLMTBitWidth = 16; + // Granularity for last modified time in second + static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; + // Maximum frequency count + static const uint16 kUserDictMaxFrequency = 0xFFFF; + +#define COARSE_UTC(year, month, day, hour, minute, second) ((year - 1970) * 365 * 24 * 60 * 60 + (month - 1) * 30 * 24 * 60 * 60 + (day - 1) * 24 * 60 * 60 + (hour - 0) * 60 * 60 + (minute - 0) * 60 + (second - 0)) + static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); + + // Correspond to offsets_ + uint32 *scores_; + // Following two fields are only valid in memory + uint32 *ids_; +#ifdef ___PREDICT_ENABLED___ + uint32 *predicts_; +#endif +#ifdef ___SYNC_ENABLED___ + uint32 *syncs_; + size_t sync_count_size_; +#endif + uint32 *offsets_by_id_; + + size_t lemma_count_left_; + size_t lemma_size_left_; + + const char *dict_file_; + + // Be sure size is 4xN + struct UserDictInfo { + // When limitation reached, how much percentage will be reclaimed (1 ~ 100) + uint32 reclaim_ratio; + // maximum lemma count, 0 means no limitation + uint32 limit_lemma_count; + // Maximum lemma size, it's different from + // whole disk file size or in-mem dict size + // 0 means no limitation + uint32 limit_lemma_size; + // Total lemma count including deleted and inuse + // Also indicate offsets_ size + uint32 lemma_count; + // Total size of lemmas including used and freed + uint32 lemma_size; + // Freed lemma count + uint32 free_count; + // Freed lemma size in byte + uint32 free_size; +#ifdef ___SYNC_ENABLED___ + uint32 sync_count; +#endif + int32 total_nfreq; + } dict_info_; + + static const uint32 kUserDictVersion = 0x0ABCDEF0; + + static const uint32 kUserDictPreAlloc = 32; + static const uint32 kUserDictAverageNchar = 8; + + enum UserDictState { + // Keep in order + USER_DICT_NONE = 0, + USER_DICT_SYNC, +#ifdef ___SYNC_ENABLED___ + USER_DICT_SYNC_DIRTY, +#endif + USER_DICT_SCORE_DIRTY, + USER_DICT_OFFSET_DIRTY, + USER_DICT_LEMMA_DIRTY, + + USER_DICT_DEFRAGMENTED, + } state_; + + struct UserDictSearchable { + uint16 splids_len; + uint16 splid_start[kMaxLemmaSize]; + uint16 splid_count[kMaxLemmaSize]; + // Compact inital letters for both FuzzyCompareSpellId and cache system + uint32 signature[kMaxLemmaSize / 4]; + }; + +#ifdef ___CACHE_ENABLED___ + enum UserDictCacheType { + USER_DICT_CACHE, + USER_DICT_MISS_CACHE, + }; + + static const int kUserDictCacheSize = 4; + static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; + + struct UserDictMissCache { + uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; + uint16 head, tail; + } miss_caches_[kMaxLemmaSize]; + + struct UserDictCache { + uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; + uint32 offsets[kUserDictCacheSize]; + uint32 lengths[kUserDictCacheSize]; + // Ring buffer + uint16 head, tail; + } caches_[kMaxLemmaSize]; + + void cache_init(); + + void cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length); + + bool cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length); + + bool load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length); + + void save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length); + + void reset_cache(); + + bool load_miss_cache(UserDictSearchable *searchable); + + void save_miss_cache(UserDictSearchable *searchable); + + void reset_miss_cache(); +#endif + + LmaScoreType translate_score(int f); + + int extract_score_freq(int raw_score); + + uint64 extract_score_lmt(int raw_score); + + inline int build_score(uint64 lmt, int freq); + + inline int64 utf16le_atoll(uint16 *s, int len); + + inline int utf16le_lltoa(int64 v, uint16 *s, int size); + + LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); + + size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool *need_extend); + + int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); + + int _get_lemma_score(LemmaIdType lemma_id); + + int is_fuzzy_prefix_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable); + + bool is_prefix_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable); + + uint32 get_dict_file_size(UserDictInfo *info); + + bool reset(const char *file); + + bool validate(const char *file); + + bool load(const char *file, LemmaIdType start_id); + + bool is_valid_state(); + + bool is_valid_lemma_id(LemmaIdType id); + + LemmaIdType get_max_lemma_id(); + + void set_lemma_flag(uint32 offset, uint8 flag); + + char get_lemma_flag(uint32 offset); + + char get_lemma_nchar(uint32 offset); + + uint16 *get_lemma_spell_ids(uint32 offset); + + uint16 *get_lemma_word(uint32 offset); + + // Prepare searchable to fasten locate process + void prepare_locate(UserDictSearchable *searchable, const uint16 *splids, uint16 len); + + // Compare initial letters only + int32 fuzzy_compare_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable); + + // Compare exactly two spell ids + // First argument must be a full id spell id + bool equal_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable); + + // Find first item by initial letters + int32 locate_first_in_offsets(const UserDictSearchable *searchable); + + LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); + + // Check if a lemma is in dictionary + int32 locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len); + + bool remove_lemma_by_offset_index(int offset_index); +#ifdef ___PREDICT_ENABLED___ + uint32 locate_where_to_insert_in_predicts(const uint16 *words, int lemma_len); + + int32 locate_first_in_predicts(const uint16 *words, int lemma_len); + + void remove_lemma_from_predict_list(uint32 offset); +#endif +#ifdef ___SYNC_ENABLED___ + void queue_lemma_for_sync(LemmaIdType id); + + void remove_lemma_from_sync_list(uint32 offset); + + void write_back_sync(int fd); +#endif + void write_back_score(int fd); + void write_back_offset(int fd); + void write_back_lemma(int fd); + void write_back_all(int fd); + void write_back(); + + struct UserDictScoreOffsetPair { + int score; + uint32 offset_index; + }; + + inline void swap(UserDictScoreOffsetPair *sop, int i, int j); + + void shift_down(UserDictScoreOffsetPair *sop, int i, int n); + + // On-disk format for each lemma + // +-------------+ + // | Version (4) | + // +-------------+ + // +-----------+-----------+--------------------+-------------------+ + // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | + // +-----------+-----------+--------------------+-------------------+ + // ... + // +-----------------------+ +-------------+ <---Offset of offset + // | Offset1 by_splids (4) | ... | OffsetN (4) | + // +-----------------------+ +-------------+ +#ifdef ___PREDICT_ENABLED___ + // +----------------------+ +-------------+ + // | Offset1 by_lemma (4) | ... | OffsetN (4) | + // +----------------------+ +-------------+ +#endif + // +------------+ +------------+ + // | Score1 (4) | ... | ScoreN (4) | + // +------------+ +------------+ +#ifdef ___SYNC_ENABLED___ + // +-------------+ +-------------+ + // | NewAdd1 (4) | ... | NewAddN (4) | + // +-------------+ +-------------+ +#endif + // +----------------+ + // | Dict Info (4x) | + // +----------------+ +}; +} // namespace ime_pinyin + +#endif diff --git a/src/share/userdict.cpp b/src/share/userdict.cpp index 5adc430..3e1a7f1 100644 --- a/src/share/userdict.cpp +++ b/src/share/userdict.cpp @@ -1,2063 +1,2112 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../include/userdict.h" -#include "../include/splparser.h" -#include "../include/ngram.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ime_pinyin { - -#ifdef ___DEBUG_PERF___ -static uint64 _ellapse_ = 0; -static struct timeval _tv_start_, _tv_end_; -#define DEBUG_PERF_BEGIN \ - do { \ - gettimeofday(&_tv_start_, NULL); \ - } while (0) -#define DEBUG_PERF_END \ - do { \ - gettimeofday(&_tv_end_, NULL); \ - _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + (_tv_end_.tv_usec - _tv_start_.tv_usec); \ - } while (0) -#define LOGD_PERF(message) ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_); -#else -#define DEBUG_PERF_BEGIN -#define DEBUG_PERF_END -#define LOGD_PERF(message) -#endif - -// XXX File load and write are thread-safe by g_mutex_ -static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER; -static struct timeval g_last_update_ = {0, 0}; - -inline uint32 UserDict::get_dict_file_size(UserDictInfo *info) { - return (4 + info->lemma_size + (info->lemma_count << 3) -#ifdef ___PREDICT_ENABLED___ - + (info->lemma_count << 2) -#endif -#ifdef ___SYNC_ENABLED___ - + (info->sync_count << 2) -#endif - + sizeof(*info)); -} - -inline LmaScoreType UserDict::translate_score(int raw_score) { - // 1) ori_freq: original user frequency - uint32 ori_freq = extract_score_freq(raw_score); - // 2) lmt_off: lmt index (week offset for example) - uint64 lmt_off = ((raw_score & 0xffff0000) >> 16); - if (kUserDictLMTBitWidth < 16) { - uint64 mask = ~(1 << kUserDictLMTBitWidth); - lmt_off &= mask; - } - // 3) now_off: current time index (current week offset for example) - // assuming load_time_ is around current time - uint64 now_off = load_time_.tv_sec; - now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity; - now_off = (now_off << (64 - kUserDictLMTBitWidth)); - now_off = (now_off >> (64 - kUserDictLMTBitWidth)); - // 4) factor: decide expand-factor - int delta = now_off - lmt_off; - if (delta > 4) delta = 4; - int factor = 80 - (delta << 4); - - double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_); - return (LmaScoreType)(log((double)factor * (double)ori_freq / tf) * NGram::kLogValueAmplifier); -} - -inline int UserDict::extract_score_freq(int raw_score) { - // Frequence stored in lowest 16 bits - int freq = (raw_score & 0x0000ffff); - return freq; -} - -inline uint64 UserDict::extract_score_lmt(int raw_score) { - uint64 lmt = ((raw_score & 0xffff0000) >> 16); - if (kUserDictLMTBitWidth < 16) { - uint64 mask = ~(1 << kUserDictLMTBitWidth); - lmt &= mask; - } - lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince; - return lmt; -} - -inline int UserDict::build_score(uint64 lmt, int freq) { - lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity; - lmt = (lmt << (64 - kUserDictLMTBitWidth)); - lmt = (lmt >> (64 - kUserDictLMTBitWidth)); - uint16 lmt16 = (uint16)lmt; - int s = freq; - s &= 0x0000ffff; - s = (lmt16 << 16) | s; - return s; -} - -inline int64 UserDict::utf16le_atoll(uint16 *s, int len) { - int64 ret = 0; - if (len <= 0) return ret; - - int flag = 1; - const uint16 *endp = s + len; - if (*s == '-') { - flag = -1; - s++; - } else if (*s == '+') { - s++; - } - - while (*s >= '0' && *s <= '9' && s < endp) { - ret += ret * 10 + (*s) - '0'; - s++; - } - return ret * flag; -} - -inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) { - if (!s || size <= 0) return 0; - uint16 *endp = s + size; - int ret_len = 0; - if (v < 0) { - *(s++) = '-'; - ++ret_len; - v *= -1; - } - - uint16 *b = s; - while (s < endp && v != 0) { - *(s++) = '0' + (v % 10); - v = v / 10; - ++ret_len; - } - - if (v != 0) return 0; - - --s; - - while (b < s) { - *b = *s; - ++b, --s; - } - - return ret_len; -} - -inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) { - offset &= kUserDictOffsetMask; - lemmas_[offset] |= flag; -} - -inline char UserDict::get_lemma_flag(uint32 offset) { - offset &= kUserDictOffsetMask; - return (char)(lemmas_[offset]); -} - -inline char UserDict::get_lemma_nchar(uint32 offset) { - offset &= kUserDictOffsetMask; - return (char)(lemmas_[offset + 1]); -} - -inline uint16 *UserDict::get_lemma_spell_ids(uint32 offset) { - offset &= kUserDictOffsetMask; - return (uint16 *)(lemmas_ + offset + 2); -} - -inline uint16 *UserDict::get_lemma_word(uint32 offset) { - offset &= kUserDictOffsetMask; - uint8 nchar = get_lemma_nchar(offset); - return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1)); -} - -inline LemmaIdType UserDict::get_max_lemma_id() { - // When a lemma is deleted, we don't not claim its id back for - // simplicity and performance - return start_id_ + dict_info_.lemma_count - 1; -} - -inline bool UserDict::is_valid_lemma_id(LemmaIdType id) { - if (id >= start_id_ && id <= get_max_lemma_id()) return true; - return false; -} - -inline bool UserDict::is_valid_state() { - if (state_ == USER_DICT_NONE) return false; - return true; -} - -UserDict::UserDict() - : start_id_(0), - version_(0), - lemmas_(NULL), - offsets_(NULL), - scores_(NULL), - ids_(NULL), -#ifdef ___PREDICT_ENABLED___ - predicts_(NULL), -#endif -#ifdef ___SYNC_ENABLED___ - syncs_(NULL), - sync_count_size_(0), -#endif - offsets_by_id_(NULL), - lemma_count_left_(0), - lemma_size_left_(0), - dict_file_(NULL), - state_(USER_DICT_NONE) { - memset(&dict_info_, 0, sizeof(dict_info_)); - memset(&load_time_, 0, sizeof(load_time_)); -#ifdef ___CACHE_ENABLED___ - cache_init(); -#endif -} - -UserDict::~UserDict() { close_dict(); } - -bool UserDict::load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - dict_file_ = strdup(file_name); - if (!dict_file_) return false; - - start_id_ = start_id; - - if (false == validate(file_name) && false == reset(file_name)) { - goto error; - } - if (false == load(file_name, start_id)) { - goto error; - } - - state_ = USER_DICT_SYNC; - - gettimeofday(&load_time_, NULL); - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("load_dict"); -#endif - return true; -error: - free((void *)dict_file_); - start_id_ = 0; - return false; -} - -bool UserDict::close_dict() { - if (state_ == USER_DICT_NONE) return true; - if (state_ == USER_DICT_SYNC) goto out; - - // If dictionary is written back by others, - // we can not simply write back here - // To do a safe flush, we have to discard all newly added - // lemmas and try to reload dict file. - pthread_mutex_lock(&g_mutex_); - if (load_time_.tv_sec > g_last_update_.tv_sec || (load_time_.tv_sec == g_last_update_.tv_sec && load_time_.tv_usec > g_last_update_.tv_usec)) { - write_back(); - gettimeofday(&g_last_update_, NULL); - } - pthread_mutex_unlock(&g_mutex_); - -out: - free((void *)dict_file_); - free(lemmas_); - free(offsets_); - free(offsets_by_id_); - free(scores_); - free(ids_); -#ifdef ___PREDICT_ENABLED___ - free(predicts_); -#endif - - version_ = 0; - dict_file_ = NULL; - lemmas_ = NULL; -#ifdef ___SYNC_ENABLED___ - syncs_ = NULL; - sync_count_size_ = 0; -#endif - offsets_ = NULL; - offsets_by_id_ = NULL; - scores_ = NULL; - ids_ = NULL; -#ifdef ___PREDICT_ENABLED___ - predicts_ = NULL; -#endif - - memset(&dict_info_, 0, sizeof(dict_info_)); - lemma_count_left_ = 0; - lemma_size_left_ = 0; - state_ = USER_DICT_NONE; - - return true; -} - -size_t UserDict::number_of_lemmas() { return dict_info_.lemma_count; } - -void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { return; } - -MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num) { - if (is_valid_state() == false) return 0; - - bool need_extend = false; - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1, lpi_items, lpi_max, &need_extend); -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("extend_dict"); -#endif - return ((*lpi_num > 0 || need_extend) ? 1 : 0); -} - -int UserDict::is_fuzzy_prefix_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable) { - if (len1 < searchable->splids_len) return 0; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - uint32 i = 0; - for (i = 0; i < searchable->splids_len; i++) { - const char py1 = *spl_trie.get_spelling_str(id1[i]); - uint16 off = 8 * (i % 4); - const char py2 = ((searchable->signature[i / 4] & (0xff << off)) >> off); - if (py1 == py2) continue; - return 0; - } - return 1; -} - -int UserDict::fuzzy_compare_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable) { - if (len1 < searchable->splids_len) return -1; - if (len1 > searchable->splids_len) return 1; - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - uint32 i = 0; - for (i = 0; i < len1; i++) { - const char py1 = *spl_trie.get_spelling_str(id1[i]); - uint16 off = 8 * (i % 4); - const char py2 = ((searchable->signature[i / 4] & (0xff << off)) >> off); - if (py1 == py2) continue; - if (py1 > py2) return 1; - return -1; - } - return 0; -} - -bool UserDict::is_prefix_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable) { - if (fulllen < searchable->splids_len) return false; - - uint32 i = 0; - for (; i < searchable->splids_len; i++) { - uint16 start_id = searchable->splid_start[i]; - uint16 count = searchable->splid_count[i]; - if (fullids[i] >= start_id && fullids[i] < start_id + count) - continue; - else - return false; - } - return true; -} - -bool UserDict::equal_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable) { - if (fulllen != searchable->splids_len) return false; - - uint32 i = 0; - for (; i < fulllen; i++) { - uint16 start_id = searchable->splid_start[i]; - uint16 count = searchable->splid_count[i]; - if (fullids[i] >= start_id && fullids[i] < start_id + count) - continue; - else - return false; - } - return true; -} - -int32 UserDict::locate_first_in_offsets(const UserDictSearchable *searchable) { - int32 begin = 0; - int32 end = dict_info_.lemma_count - 1; - int32 middle = -1; - - int32 first_prefix = middle; - int32 last_matched = middle; - - while (begin <= end) { - middle = (begin + end) >> 1; - uint32 offset = offsets_[middle]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 *splids = get_lemma_spell_ids(offset); - int cmp = fuzzy_compare_spell_id(splids, nchar, searchable); - int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable); - - if (pre) first_prefix = middle; - - if (cmp < 0) { - begin = middle + 1; - } else if (cmp > 0) { - end = middle - 1; - } else { - end = middle - 1; - last_matched = middle; - } - } - - return first_prefix; -} - -void UserDict::prepare_locate(UserDictSearchable *searchable, const uint16 *splid_str, uint16 splid_str_len) { - searchable->splids_len = splid_str_len; - memset(searchable->signature, 0, sizeof(searchable->signature)); - - SpellingTrie &spl_trie = SpellingTrie::get_instance(); - uint32 i = 0; - for (; i < splid_str_len; i++) { - if (spl_trie.is_half_id(splid_str[i])) { - searchable->splid_count[i] = spl_trie.half_to_full(splid_str[i], &(searchable->splid_start[i])); - } else { - searchable->splid_count[i] = 1; - searchable->splid_start[i] = splid_str[i]; - } - const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]); - searchable->signature[i >> 2] |= (py << (8 * (i % 4))); - } -} - -size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max) { return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL); } - -size_t UserDict::_get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool *need_extend) { - bool tmp_extend; - if (!need_extend) need_extend = &tmp_extend; - - *need_extend = false; - - if (is_valid_state() == false) return 0; - if (lpi_max <= 0) return 0; - - if (0 == pthread_mutex_trylock(&g_mutex_)) { - if (load_time_.tv_sec < g_last_update_.tv_sec || (load_time_.tv_sec == g_last_update_.tv_sec && load_time_.tv_usec < g_last_update_.tv_usec)) { - // Others updated disk file, have to reload - pthread_mutex_unlock(&g_mutex_); - flush_cache(); - } else { - pthread_mutex_unlock(&g_mutex_); - } - } else { - } - - UserDictSearchable searchable; - prepare_locate(&searchable, splid_str, splid_str_len); - - uint32 max_off = dict_info_.lemma_count; -#ifdef ___CACHE_ENABLED___ - int32 middle; - uint32 start, count; - bool cached = cache_hit(&searchable, &start, &count); - if (cached) { - middle = start; - max_off = start + count; - } else { - middle = locate_first_in_offsets(&searchable); - start = middle; - } -#else - int32 middle = locate_first_in_offsets(&searchable); -#endif - - if (middle == -1) { -#ifdef ___CACHE_ENABLED___ - if (!cached) cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0); -#endif - return 0; - } - - size_t lpi_current = 0; - - bool fuzzy_break = false; - bool prefix_break = false; - while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) { - if (lpi_current >= lpi_max) break; - uint32 offset = offsets_[middle]; - // Ignore deleted lemmas - if (offset & kUserDictOffsetFlagRemove) { - middle++; - continue; - } - uint8 nchar = get_lemma_nchar(offset); - uint16 *splids = get_lemma_spell_ids(offset); -#ifdef ___CACHE_ENABLED___ - if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { -#else - if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { -#endif - fuzzy_break = true; - } - - if (prefix_break == false) { - if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) { - if (*need_extend == false && is_prefix_spell_id(splids, nchar, &searchable)) { - *need_extend = true; - } - } else { - prefix_break = true; - } - } - - if (equal_spell_id(splids, nchar, &searchable) == true) { - lpi_items[lpi_current].psb = translate_score(scores_[middle]); - lpi_items[lpi_current].id = ids_[middle]; - lpi_items[lpi_current].lma_len = nchar; - lpi_current++; - } - middle++; - } - -#ifdef ___CACHE_ENABLED___ - if (!cached) { - count = middle - start; - cache_push(USER_DICT_CACHE, &searchable, start, count); - } -#endif - - return lpi_current; -} - -uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max) { - if (is_valid_state() == false) return 0; - if (is_valid_lemma_id(id_lemma) == false) return 0; - uint32 offset = offsets_by_id_[id_lemma - start_id_]; - uint8 nchar = get_lemma_nchar(offset); - char16 *str = get_lemma_word(offset); - uint16 m = nchar < str_max - 1 ? nchar : str_max - 1; - int i = 0; - for (; i < m; i++) { - str_buf[i] = str[i]; - } - str_buf[i] = 0; - return m; -} - -uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid) { - if (is_valid_lemma_id(id_lemma) == false) return 0; - uint32 offset = offsets_by_id_[id_lemma - start_id_]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 *ids = get_lemma_spell_ids(offset); - int i = 0; - for (; i < nchar && i < splids_max; i++) splids[i] = ids[i]; - return i; -} - -size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used) { - uint32 new_added = 0; -#ifdef ___PREDICT_ENABLED___ - int32 end = dict_info_.lemma_count - 1; - int j = locate_first_in_predicts((const uint16 *)last_hzs, hzs_len); - if (j == -1) return 0; - - while (j <= end) { - uint32 offset = predicts_[j]; - // Ignore deleted lemmas - if (offset & kUserDictOffsetFlagRemove) { - j++; - continue; - } - uint32 nchar = get_lemma_nchar(offset); - uint16 *words = get_lemma_word(offset); - uint16 *splids = get_lemma_spell_ids(offset); - - if (nchar <= hzs_len) { - j++; - continue; - } - - if (memcmp(words, last_hzs, hzs_len << 1) == 0) { - if (new_added >= npre_max) { - return new_added; - } - uint32 cpy_len = (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1)) - (hzs_len << 1); - npre_items[new_added].his_len = hzs_len; - npre_items[new_added].psb = get_lemma_score(words, splids, nchar); - memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len); - if ((cpy_len >> 1) < kMaxPredictSize) { - npre_items[new_added].pre_hzs[cpy_len >> 1] = 0; - } - new_added++; - } else { - break; - } - - j++; - } -#endif - return new_added; -} - -int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len) { - int32 max_off = dict_info_.lemma_count; - - UserDictSearchable searchable; - prepare_locate(&searchable, splid_str, lemma_len); -#ifdef ___CACHE_ENABLED___ - int32 off; - uint32 start, count; - bool cached = load_cache(&searchable, &start, &count); - if (cached) { - off = start; - max_off = start + count; - } else { - off = locate_first_in_offsets(&searchable); - start = off; - } -#else - int32 off = locate_first_in_offsets(&searchable); -#endif - - if (off == -1) { - return off; - } - - while (off < max_off) { - uint32 offset = offsets_[off]; - if (offset & kUserDictOffsetFlagRemove) { - off++; - continue; - } - uint16 *splids = get_lemma_spell_ids(offset); -#ifdef ___CACHE_ENABLED___ - if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) break; -#else - if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) break; -#endif - if (equal_spell_id(splids, lemma_len, &searchable) == true) { - uint16 *str = get_lemma_word(offset); - uint32 i = 0; - for (i = 0; i < lemma_len; i++) { - if (str[i] == lemma_str[i]) continue; - break; - } - if (i < lemma_len) { - off++; - continue; - } -#ifdef ___CACHE_ENABLED___ - // No need to save_cache here, since current function is invoked by - // put_lemma. It's rarely possible for a user input same lemma twice. - // That means first time user type a new lemma, it is newly added into - // user dictionary, then it's possible that user type the same lemma - // again. - // Another reason save_cache can not be invoked here is this function - // aborts when lemma is found, and it never knows the count. -#endif - return off; - } - off++; - } - - return -1; -} - -#ifdef ___PREDICT_ENABLED___ -uint32 UserDict::locate_where_to_insert_in_predicts(const uint16 *words, int lemma_len) { - int32 begin = 0; - int32 end = dict_info_.lemma_count - 1; - int32 middle = end; - - uint32 last_matched = middle; - - while (begin <= end) { - middle = (begin + end) >> 1; - uint32 offset = offsets_[middle]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 *ws = get_lemma_word(offset); - - uint32 minl = nchar < lemma_len ? nchar : lemma_len; - uint32 k = 0; - int cmp = 0; - - for (; k < minl; k++) { - if (ws[k] < words[k]) { - cmp = -1; - break; - } else if (ws[k] > words[k]) { - cmp = 1; - break; - } - } - if (cmp == 0) { - if (nchar < lemma_len) - cmp = -1; - else if (nchar > lemma_len) - cmp = 1; - } - - if (cmp < 0) { - begin = middle + 1; - last_matched = middle; - } else if (cmp > 0) { - end = middle - 1; - } else { - end = middle - 1; - last_matched = middle; - } - } - - return last_matched; -} - -int32 UserDict::locate_first_in_predicts(const uint16 *words, int lemma_len) { - int32 begin = 0; - int32 end = dict_info_.lemma_count - 1; - int32 middle = -1; - - int32 last_matched = middle; - - while (begin <= end) { - middle = (begin + end) >> 1; - uint32 offset = offsets_[middle]; - uint8 nchar = get_lemma_nchar(offset); - const uint16 *ws = get_lemma_word(offset); - - uint32 minl = nchar < lemma_len ? nchar : lemma_len; - uint32 k = 0; - int cmp = 0; - - for (; k < minl; k++) { - if (ws[k] < words[k]) { - cmp = -1; - break; - } else if (ws[k] > words[k]) { - cmp = 1; - break; - } - } - if (cmp == 0) { - if (nchar >= lemma_len) last_matched = middle; - if (nchar < lemma_len) - cmp = -1; - else if (nchar > lemma_len) - cmp = 1; - } - - if (cmp < 0) { - begin = middle + 1; - } else if (cmp > 0) { - end = middle - 1; - } else { - end = middle - 1; - } - } - - return last_matched; -} - -#endif - -LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off == -1) { - return 0; - } - - return ids_[off]; -} - -LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) { - if (is_valid_state() == false) return 0; - if (is_valid_lemma_id(lemma_id) == false) return 0; - - return translate_score(_get_lemma_score(lemma_id)); -} - -LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { - if (is_valid_state() == false) return 0; - return translate_score(_get_lemma_score(lemma_str, splids, lemma_len)); -} - -int UserDict::_get_lemma_score(LemmaIdType lemma_id) { - if (is_valid_state() == false) return 0; - if (is_valid_lemma_id(lemma_id) == false) return 0; - - uint32 offset = offsets_by_id_[lemma_id - start_id_]; - - uint32 nchar = get_lemma_nchar(offset); - uint16 *spl = get_lemma_spell_ids(offset); - uint16 *wrd = get_lemma_word(offset); - - int32 off = locate_in_offsets(wrd, spl, nchar); - if (off == -1) { - return 0; - } - - return scores_[off]; -} - -int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { - if (is_valid_state() == false) return 0; - - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off == -1) { - return 0; - } - - return scores_[off]; -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::remove_lemma_from_sync_list(uint32 offset) { - offset &= kUserDictOffsetMask; - uint32 i = 0; - for (; i < dict_info_.sync_count; i++) { - unsigned int off = (syncs_[i] & kUserDictOffsetMask); - if (off == offset) break; - } - if (i < dict_info_.sync_count) { - syncs_[i] = syncs_[dict_info_.sync_count - 1]; - dict_info_.sync_count--; - } -} -#endif - -#ifdef ___PREDICT_ENABLED___ -void UserDict::remove_lemma_from_predict_list(uint32 offset) { - offset &= kUserDictOffsetMask; - uint32 i = 0; - for (; i < dict_info_.lemma_count; i++) { - unsigned int off = (predicts_[i] & kUserDictOffsetMask); - if (off == offset) { - predicts_[i] |= kUserDictOffsetFlagRemove; - break; - } - } -} -#endif - -bool UserDict::remove_lemma_by_offset_index(int offset_index) { - if (is_valid_state() == false) return 0; - - int32 off = offset_index; - if (off == -1) { - return false; - } - - uint32 offset = offsets_[off]; - uint32 nchar = get_lemma_nchar(offset); - - offsets_[off] |= kUserDictOffsetFlagRemove; - -#ifdef ___SYNC_ENABLED___ - // Remove corresponding sync item - remove_lemma_from_sync_list(offset); -#endif - -#ifdef ___PREDICT_ENABLED___ - remove_lemma_from_predict_list(offset); -#endif - dict_info_.free_count++; - dict_info_.free_size += (2 + (nchar << 2)); - - if (state_ < USER_DICT_OFFSET_DIRTY) state_ = USER_DICT_OFFSET_DIRTY; - return true; -} - -bool UserDict::remove_lemma(LemmaIdType lemma_id) { - if (is_valid_state() == false) return 0; - if (is_valid_lemma_id(lemma_id) == false) return false; - uint32 offset = offsets_by_id_[lemma_id - start_id_]; - - uint32 nchar = get_lemma_nchar(offset); - uint16 *spl = get_lemma_spell_ids(offset); - uint16 *wrd = get_lemma_word(offset); - - int32 off = locate_in_offsets(wrd, spl, nchar); - - return remove_lemma_by_offset_index(off); -} - -void UserDict::flush_cache() { - LemmaIdType start_id = start_id_; - const char *file = strdup(dict_file_); - if (!file) return; - close_dict(); - load_dict(file, start_id, kUserDictIdEnd); - free((void *)file); -#ifdef ___CACHE_ENABLED___ - cache_init(); -#endif - return; -} - -bool UserDict::reset(const char *file) { - FILE *fp = fopen(file, "w+"); - if (!fp) { - return false; - } - uint32 version = kUserDictVersion; - size_t wred = fwrite(&version, 1, 4, fp); - UserDictInfo info; - memset(&info, 0, sizeof(info)); - // By default, no limitation for lemma count and size - // thereby, reclaim_ratio is never used - wred += fwrite(&info, 1, sizeof(info), fp); - if (wred != sizeof(info) + sizeof(version)) { - fclose(fp); - unlink(file); - return false; - } - fclose(fp); - return true; -} - -bool UserDict::validate(const char *file) { - // b is ignored in POSIX compatible os including Linux - // while b is important flag for Windows to specify binary mode - FILE *fp = fopen(file, "rb"); - if (!fp) { - return false; - } - - size_t size; - size_t readed; - uint32 version; - UserDictInfo dict_info; - - // validate - int err = fseek(fp, 0, SEEK_END); - if (err) { - goto error; - } - - size = ftell(fp); - if (size < 4 + sizeof(dict_info)) { - goto error; - } - - err = fseek(fp, 0, SEEK_SET); - if (err) { - goto error; - } - - readed = fread(&version, 1, sizeof(version), fp); - if (readed < sizeof(version)) { - goto error; - } - if (version != kUserDictVersion) { - goto error; - } - - err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); - if (err) { - goto error; - } - - readed = fread(&dict_info, 1, sizeof(dict_info), fp); - if (readed != sizeof(dict_info)) { - goto error; - } - - if (size != get_dict_file_size(&dict_info)) { - goto error; - } - - fclose(fp); - return true; - -error: - fclose(fp); - return false; -} - -bool UserDict::load(const char *file, LemmaIdType start_id) { - if (0 != pthread_mutex_trylock(&g_mutex_)) { - return false; - } - // b is ignored in POSIX compatible os including Linux - // while b is important flag for Windows to specify binary mode - FILE *fp = fopen(file, "rb"); - if (!fp) { - pthread_mutex_unlock(&g_mutex_); - return false; - } - - size_t readed, toread; - UserDictInfo dict_info; - uint8 *lemmas = NULL; - uint32 *offsets = NULL; -#ifdef ___SYNC_ENABLED___ - uint32 *syncs = NULL; -#endif - uint32 *scores = NULL; - uint32 *ids = NULL; - uint32 *offsets_by_id = NULL; -#ifdef ___PREDICT_ENABLED___ - uint32 *predicts = NULL; -#endif - size_t i; - int err; - - err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); - if (err) goto error; - - readed = fread(&dict_info, 1, sizeof(dict_info), fp); - if (readed != sizeof(dict_info)) goto error; - - lemmas = (uint8 *)malloc(dict_info.lemma_size + (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)))); - - if (!lemmas) goto error; - - offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!offsets) goto error; - -#ifdef ___PREDICT_ENABLED___ - predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!predicts) goto error; -#endif - -#ifdef ___SYNC_ENABLED___ - syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2); - if (!syncs) goto error; -#endif - - scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!scores) goto error; - - ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!ids) goto error; - - offsets_by_id = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); - if (!offsets_by_id) goto error; - - err = fseek(fp, 4, SEEK_SET); - if (err) goto error; - - readed = 0; - while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) { - readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp); - } - if (readed < dict_info.lemma_size) goto error; - - toread = (dict_info.lemma_count << 2); - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8 *)offsets) + readed), 1, toread - readed, fp); - } - if (readed < toread) goto error; - -#ifdef ___PREDICT_ENABLED___ - toread = (dict_info.lemma_count << 2); - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8 *)predicts) + readed), 1, toread - readed, fp); - } - if (readed < toread) goto error; -#endif - - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8 *)scores) + readed), 1, toread - readed, fp); - } - if (readed < toread) goto error; - -#ifdef ___SYNC_ENABLED___ - toread = (dict_info.sync_count << 2); - readed = 0; - while (readed < toread && !ferror(fp) && !feof(fp)) { - readed += fread((((uint8 *)syncs) + readed), 1, toread - readed, fp); - } - if (readed < toread) goto error; -#endif - - for (i = 0; i < dict_info.lemma_count; i++) { - ids[i] = start_id + i; - offsets_by_id[i] = offsets[i]; - } - - lemmas_ = lemmas; - offsets_ = offsets; -#ifdef ___SYNC_ENABLED___ - syncs_ = syncs; - sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc; -#endif - offsets_by_id_ = offsets_by_id; - scores_ = scores; - ids_ = ids; -#ifdef ___PREDICT_ENABLED___ - predicts_ = predicts; -#endif - lemma_count_left_ = kUserDictPreAlloc; - lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)); - memcpy(&dict_info_, &dict_info, sizeof(dict_info)); - state_ = USER_DICT_SYNC; - - fclose(fp); - - pthread_mutex_unlock(&g_mutex_); - return true; - -error: - if (lemmas) free(lemmas); - if (offsets) free(offsets); -#ifdef ___SYNC_ENABLED___ - if (syncs) free(syncs); -#endif - if (scores) free(scores); - if (ids) free(ids); - if (offsets_by_id) free(offsets_by_id); -#ifdef ___PREDICT_ENABLED___ - if (predicts) free(predicts); -#endif - fclose(fp); - pthread_mutex_unlock(&g_mutex_); - return false; -} - -void UserDict::write_back() { - // XXX write back is only allowed from close_dict due to thread-safe sake - if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC) return; - int fd = open(dict_file_, O_WRONLY); - if (fd == -1) return; - switch (state_) { - case USER_DICT_DEFRAGMENTED: - write_back_all(fd); - break; - case USER_DICT_LEMMA_DIRTY: - write_back_lemma(fd); - break; - case USER_DICT_OFFSET_DIRTY: - write_back_offset(fd); - break; - case USER_DICT_SCORE_DIRTY: - write_back_score(fd); - break; -#ifdef ___SYNC_ENABLED___ - case USER_DICT_SYNC_DIRTY: - write_back_sync(fd); - break; -#endif - default: - break; - } - // It seems truncate is not need on Linux, Windows except Mac - // I am doing it here anyway for safety. - off_t cur = lseek(fd, 0, SEEK_CUR); - ftruncate(fd, cur); - close(fd); - state_ = USER_DICT_SYNC; -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::write_back_sync(int fd) { - int err = lseek(fd, - 4 + dict_info_.lemma_size + (dict_info_.lemma_count << 3) -#ifdef ___PREDICT_ENABLED___ - + (dict_info_.lemma_count << 2) -#endif - , - SEEK_SET); - if (err == -1) return; - write(fd, syncs_, dict_info_.sync_count << 2); - write(fd, &dict_info_, sizeof(dict_info_)); -} -#endif - -void UserDict::write_back_offset(int fd) { - int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET); - if (err == -1) return; - write(fd, offsets_, dict_info_.lemma_count << 2); -#ifdef ___PREDICT_ENABLED___ - write(fd, predicts_, dict_info_.lemma_count << 2); -#endif - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -void UserDict::write_back_score(int fd) { - int err = lseek(fd, - 4 + dict_info_.lemma_size + (dict_info_.lemma_count << 2) -#ifdef ___PREDICT_ENABLED___ - + (dict_info_.lemma_count << 2) -#endif - , - SEEK_SET); - if (err == -1) return; - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -void UserDict::write_back_lemma(int fd) { - int err = lseek(fd, 4, SEEK_SET); - if (err == -1) return; - // New lemmas are always appended, no need to write whole lemma block - size_t need_write = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_; - err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR); - if (err == -1) return; - write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write); - - write(fd, offsets_, dict_info_.lemma_count << 2); -#ifdef ___PREDICT_ENABLED___ - write(fd, predicts_, dict_info_.lemma_count << 2); -#endif - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -void UserDict::write_back_all(int fd) { - // XXX lemma_size is handled differently in writeall - // and writelemma. I update lemma_size and lemma_count in different - // places for these two cases. Should fix it to make it consistent. - int err = lseek(fd, 4, SEEK_SET); - if (err == -1) return; - write(fd, lemmas_, dict_info_.lemma_size); - write(fd, offsets_, dict_info_.lemma_count << 2); -#ifdef ___PREDICT_ENABLED___ - write(fd, predicts_, dict_info_.lemma_count << 2); -#endif - write(fd, scores_, dict_info_.lemma_count << 2); -#ifdef ___SYNC_ENABLED___ - write(fd, syncs_, dict_info_.sync_count << 2); -#endif - write(fd, &dict_info_, sizeof(dict_info_)); -} - -#ifdef ___CACHE_ENABLED___ -bool UserDict::load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length) { - UserDictCache *cache = &caches_[searchable->splids_len - 1]; - if (cache->head == cache->tail) return false; - - uint16 j, sig_len = kMaxLemmaSize / 4; - uint16 i = cache->head; - while (1) { - j = 0; - for (; j < sig_len; j++) { - if (cache->signatures[i][j] != searchable->signature[j]) break; - } - if (j < sig_len) { - i++; - if (i >= kUserDictCacheSize) i -= kUserDictCacheSize; - if (i == cache->tail) break; - continue; - } - *offset = cache->offsets[i]; - *length = cache->lengths[i]; - return true; - } - return false; -} - -void UserDict::save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length) { - UserDictCache *cache = &caches_[searchable->splids_len - 1]; - uint16 next = cache->tail; - - cache->offsets[next] = offset; - cache->lengths[next] = length; - uint16 sig_len = kMaxLemmaSize / 4; - uint16 j = 0; - for (; j < sig_len; j++) { - cache->signatures[next][j] = searchable->signature[j]; - } - - if (++next >= kUserDictCacheSize) { - next -= kUserDictCacheSize; - } - if (next == cache->head) { - cache->head++; - if (cache->head >= kUserDictCacheSize) { - cache->head -= kUserDictCacheSize; - } - } - cache->tail = next; -} - -void UserDict::reset_cache() { memset(caches_, 0, sizeof(caches_)); } - -bool UserDict::load_miss_cache(UserDictSearchable *searchable) { - UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; - if (cache->head == cache->tail) return false; - - uint16 j, sig_len = kMaxLemmaSize / 4; - uint16 i = cache->head; - while (1) { - j = 0; - for (; j < sig_len; j++) { - if (cache->signatures[i][j] != searchable->signature[j]) break; - } - if (j < sig_len) { - i++; - if (i >= kUserDictMissCacheSize) i -= kUserDictMissCacheSize; - if (i == cache->tail) break; - continue; - } - return true; - } - return false; -} - -void UserDict::save_miss_cache(UserDictSearchable *searchable) { - UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; - uint16 next = cache->tail; - - uint16 sig_len = kMaxLemmaSize / 4; - uint16 j = 0; - for (; j < sig_len; j++) { - cache->signatures[next][j] = searchable->signature[j]; - } - - if (++next >= kUserDictMissCacheSize) { - next -= kUserDictMissCacheSize; - } - if (next == cache->head) { - cache->head++; - if (cache->head >= kUserDictMissCacheSize) { - cache->head -= kUserDictMissCacheSize; - } - } - cache->tail = next; -} - -void UserDict::reset_miss_cache() { memset(miss_caches_, 0, sizeof(miss_caches_)); } - -void UserDict::cache_init() { - reset_cache(); - reset_miss_cache(); -} - -bool UserDict::cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length) { - bool hit = load_miss_cache(searchable); - if (hit) { - *offset = 0; - *length = 0; - return true; - } - hit = load_cache(searchable, offset, length); - if (hit) { - return true; - } - return false; -} - -void UserDict::cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length) { - switch (type) { - case USER_DICT_MISS_CACHE: - save_miss_cache(searchable); - break; - case USER_DICT_CACHE: - save_cache(searchable, offset, length); - break; - default: - break; - } -} - -#endif - -void UserDict::defragment(void) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - if (is_valid_state() == false) return; - // Fixup offsets_, set REMOVE flag to lemma's flag if needed - size_t first_freed = 0; - size_t first_inuse = 0; - while (first_freed < dict_info_.lemma_count) { - // Find first freed offset - while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 && first_freed < dict_info_.lemma_count) { - first_freed++; - } - if (first_freed < dict_info_.lemma_count) { - // Save REMOVE flag to lemma flag - int off = offsets_[first_freed]; - set_lemma_flag(off, kUserDictLemmaFlagRemove); - } else { - break; - } - // Find first inuse offse after first_freed - first_inuse = first_freed + 1; - while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) && (first_inuse < dict_info_.lemma_count)) { - // Save REMOVE flag to lemma flag - int off = offsets_[first_inuse]; - set_lemma_flag(off, kUserDictLemmaFlagRemove); - first_inuse++; - } - if (first_inuse >= dict_info_.lemma_count) { - break; - } - // Swap offsets_ - int tmp = offsets_[first_inuse]; - offsets_[first_inuse] = offsets_[first_freed]; - offsets_[first_freed] = tmp; - // Move scores_, no need to swap - tmp = scores_[first_inuse]; - scores_[first_inuse] = scores_[first_freed]; - scores_[first_freed] = tmp; - // Swap ids_ - LemmaIdType tmpid = ids_[first_inuse]; - ids_[first_inuse] = ids_[first_freed]; - ids_[first_freed] = tmpid; - // Go on - first_freed++; - } -#ifdef ___PREDICT_ENABLED___ - // Fixup predicts_ - first_freed = 0; - first_inuse = 0; - while (first_freed < dict_info_.lemma_count) { - // Find first freed offset - while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 && first_freed < dict_info_.lemma_count) { - first_freed++; - } - if (first_freed >= dict_info_.lemma_count) break; - // Find first inuse offse after first_freed - first_inuse = first_freed + 1; - while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove) && (first_inuse < dict_info_.lemma_count)) { - first_inuse++; - } - if (first_inuse >= dict_info_.lemma_count) { - break; - } - // Swap offsets_ - int tmp = predicts_[first_inuse]; - predicts_[first_inuse] = predicts_[first_freed]; - predicts_[first_freed] = tmp; - // Go on - first_freed++; - } -#endif - dict_info_.lemma_count = first_freed; - // Fixup lemmas_ - size_t begin = 0; - size_t end = 0; - size_t dst = 0; - int total_size = dict_info_.lemma_size + lemma_size_left_; - int total_count = dict_info_.lemma_count + lemma_count_left_; - size_t real_size = total_size - lemma_size_left_; - while (dst < real_size) { - unsigned char flag = get_lemma_flag(dst); - unsigned char nchr = get_lemma_nchar(dst); - if ((flag & kUserDictLemmaFlagRemove) == 0) { - dst += nchr * 4 + 2; - continue; - } - break; - } - if (dst >= real_size) return; - - end = dst; - while (end < real_size) { - begin = end + get_lemma_nchar(end) * 4 + 2; - repeat: - // not used any more - if (begin >= real_size) break; - unsigned char flag = get_lemma_flag(begin); - unsigned char nchr = get_lemma_nchar(begin); - if (flag & kUserDictLemmaFlagRemove) { - begin += nchr * 4 + 2; - goto repeat; - } - end = begin + nchr * 4 + 2; - while (end < real_size) { - unsigned char eflag = get_lemma_flag(end); - unsigned char enchr = get_lemma_nchar(end); - if ((eflag & kUserDictLemmaFlagRemove) == 0) { - end += enchr * 4 + 2; - continue; - } - break; - } - memmove(lemmas_ + dst, lemmas_ + begin, end - begin); - for (size_t j = 0; j < dict_info_.lemma_count; j++) { - if (offsets_[j] >= begin && offsets_[j] < end) { - offsets_[j] -= (begin - dst); - offsets_by_id_[ids_[j] - start_id_] = offsets_[j]; - } -#ifdef ___PREDICT_ENABLED___ - if (predicts_[j] >= begin && predicts_[j] < end) { - predicts_[j] -= (begin - dst); - } -#endif - } -#ifdef ___SYNC_ENABLED___ - for (size_t j = 0; j < dict_info_.sync_count; j++) { - if (syncs_[j] >= begin && syncs_[j] < end) { - syncs_[j] -= (begin - dst); - } - } -#endif - dst += (end - begin); - } - - dict_info_.free_count = 0; - dict_info_.free_size = 0; - dict_info_.lemma_size = dst; - lemma_size_left_ = total_size - dict_info_.lemma_size; - lemma_count_left_ = total_count - dict_info_.lemma_count; - - // XXX Without following code, - // offsets_by_id_ is not reordered. - // That's to say, all removed lemmas' ids are not collected back. - // There may not be room for addition of new lemmas due to - // offsests_by_id_ reason, although lemma_size_left_ is fixed. - // By default, we do want defrag as fast as possible, because - // during defrag procedure, other peers can not write new lemmas - // to user dictionary file. - // XXX If write-back is invoked immediately after - // this defragment, no need to fix up following in-mem data. - for (uint32 i = 0; i < dict_info_.lemma_count; i++) { - ids_[i] = start_id_ + i; - offsets_by_id_[i] = offsets_[i]; - } - - state_ = USER_DICT_DEFRAGMENTED; - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("defragment"); -#endif -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) { - if (is_valid_state() == false) return; - if (end > dict_info_.sync_count) end = dict_info_.sync_count; - memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2); - dict_info_.sync_count -= (end - start); - if (state_ < USER_DICT_SYNC_DIRTY) state_ = USER_DICT_SYNC_DIRTY; -} - -int UserDict::get_sync_count() { - if (is_valid_state() == false) return 0; - return dict_info_.sync_count; -} - -LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { - int again = 0; -begin: - LemmaIdType id; - uint32 *syncs_bak = syncs_; - syncs_ = NULL; - id = _put_lemma(lemma_str, splids, lemma_len, count, lmt); - syncs_ = syncs_bak; - if (id == 0 && again == 0) { - if ((dict_info_.limit_lemma_count > 0 && dict_info_.lemma_count >= dict_info_.limit_lemma_count) || (dict_info_.limit_lemma_size > 0 && dict_info_.lemma_size + (2 + (lemma_len << 2)) > dict_info_.limit_lemma_size)) { - // XXX Always reclaim and defrag in sync code path - // sync thread is background thread and ok with heavy work - reclaim(); - defragment(); - flush_cache(); - again = 1; - goto begin; - } - } - return id; -} - -int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 *lemmas, int len) { - int newly_added = 0; - - SpellingParser *spl_parser = new SpellingParser(); - if (!spl_parser) { - return 0; - } -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - char16 *ptr = lemmas; - - // Extract pinyin,words,frequence,last_mod_time - char16 *p = ptr, *py16 = ptr; - char16 *hz16 = NULL; - int py16_len = 0; - uint16 splid[kMaxLemmaSize]; - int splid_len = 0; - int hz16_len = 0; - char16 *fr16 = NULL; - int fr16_len = 0; - - while (p - ptr < len) { - // Pinyin - py16 = p; - splid_len = 0; - while (*p != 0x2c && (p - ptr) < len) { - if (*p == 0x20) splid_len++; - p++; - } - splid_len++; - if (p - ptr == len) break; - py16_len = p - py16; - if (kMaxLemmaSize < splid_len) { - break; - } - bool is_pre; - int splidl = spl_parser->splstr16_to_idxs_f(py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre); - if (splidl != splid_len) break; - // Phrase - hz16 = ++p; - while (*p != 0x2c && (p - ptr) < len) { - p++; - } - hz16_len = p - hz16; - if (hz16_len != splid_len) break; - // Frequency - fr16 = ++p; - fr16_len = 0; - while (*p != 0x2c && (p - ptr) < len) { - p++; - } - fr16_len = p - fr16; - uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len); - // Last modified time - fr16 = ++p; - fr16_len = 0; - while (*p != 0x3b && (p - ptr) < len) { - p++; - } - fr16_len = p - fr16; - uint64 last_mod = utf16le_atoll(fr16, fr16_len); - - put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod); - newly_added++; - - p++; - } - -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("put_lemmas_no_sync_from_utf16le_string"); -#endif - return newly_added; -} - -int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(char16 *str, int size, int *count) { - int len = 0; - *count = 0; - - int left_len = size; - - if (is_valid_state() == false) return len; - - SpellingTrie *spl_trie = &SpellingTrie::get_instance(); - if (!spl_trie) { - return 0; - } - - uint32 i; - for (i = 0; i < dict_info_.sync_count; i++) { - int offset = syncs_[i]; - uint32 nchar = get_lemma_nchar(offset); - uint16 *spl = get_lemma_spell_ids(offset); - uint16 *wrd = get_lemma_word(offset); - int score = _get_lemma_score(wrd, spl, nchar); - - static char score_temp[32], *pscore_temp = score_temp; - static char16 temp[256], *ptemp = temp; - - pscore_temp = score_temp; - ptemp = temp; - - uint32 j; - // Add pinyin - for (j = 0; j < nchar; j++) { - int ret_len = spl_trie->get_spelling_str16(spl[j], ptemp, temp + sizeof(temp) - ptemp); - if (ret_len <= 0) break; - ptemp += ret_len; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ' '; - } else { - j = 0; - break; - } - } - if (j < nchar) { - continue; - } - ptemp--; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ','; - } else { - continue; - } - // Add phrase - for (j = 0; j < nchar; j++) { - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = wrd[j]; - } else { - break; - } - } - if (j < nchar) { - continue; - } - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ','; - } else { - continue; - } - // Add frequency - uint32 intf = extract_score_freq(score); - int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp); - if (ret_len <= 0) continue; - ptemp += ret_len; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ','; - } else { - continue; - } - // Add last modified time - uint64 last_mod = extract_score_lmt(score); - ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp); - if (ret_len <= 0) continue; - ptemp += ret_len; - if (ptemp < temp + sizeof(temp) - 1) { - *(ptemp++) = ';'; - } else { - continue; - } - - // Write to string - int need_len = ptemp - temp; - if (need_len > left_len) break; - memcpy(str + len, temp, need_len * 2); - left_len -= need_len; - - len += need_len; - (*count)++; - } - - if (len > 0) { - if (state_ < USER_DICT_SYNC_DIRTY) state_ = USER_DICT_SYNC_DIRTY; - } - return len; -} - -#endif - -bool UserDict::state(UserDictStat *stat) { - if (is_valid_state() == false) return false; - if (!stat) return false; - stat->version = version_; - stat->file_name = dict_file_; - stat->load_time.tv_sec = load_time_.tv_sec; - stat->load_time.tv_usec = load_time_.tv_usec; - pthread_mutex_lock(&g_mutex_); - stat->last_update.tv_sec = g_last_update_.tv_sec; - stat->last_update.tv_usec = g_last_update_.tv_usec; - pthread_mutex_unlock(&g_mutex_); - stat->disk_size = get_dict_file_size(&dict_info_); - stat->lemma_count = dict_info_.lemma_count; - stat->lemma_size = dict_info_.lemma_size; - stat->delete_count = dict_info_.free_count; - stat->delete_size = dict_info_.free_size; -#ifdef ___SYNC_ENABLED___ - stat->sync_count = dict_info_.sync_count; -#endif - stat->limit_lemma_count = dict_info_.limit_lemma_count; - stat->limit_lemma_size = dict_info_.limit_lemma_size; - stat->reclaim_ratio = dict_info_.reclaim_ratio; - return true; -} - -void UserDict::set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio) { - dict_info_.limit_lemma_count = max_lemma_count; - dict_info_.limit_lemma_size = max_lemma_size; - if (reclaim_ratio > 100) reclaim_ratio = 100; - dict_info_.reclaim_ratio = reclaim_ratio; -} - -void UserDict::reclaim() { - if (is_valid_state() == false) return; - - switch (dict_info_.reclaim_ratio) { - case 0: - return; - case 100: - // TODO: CLEAR to be implemented - assert(false); - return; - default: - break; - } - - // XXX Reclaim is only based on count, not size - uint32 count = dict_info_.lemma_count; - int rc = count * dict_info_.reclaim_ratio / 100; - - UserDictScoreOffsetPair *score_offset_pairs = NULL; - score_offset_pairs = (UserDictScoreOffsetPair *)malloc(sizeof(UserDictScoreOffsetPair) * rc); - if (score_offset_pairs == NULL) { - return; - } - - for (int i = 0; i < rc; i++) { - int s = scores_[i]; - score_offset_pairs[i].score = s; - score_offset_pairs[i].offset_index = i; - } - - for (int i = (rc + 1) / 2; i >= 0; i--) shift_down(score_offset_pairs, i, rc); - - for (uint32 i = rc; i < dict_info_.lemma_count; i++) { - int s = scores_[i]; - if (s < score_offset_pairs[0].score) { - score_offset_pairs[0].score = s; - score_offset_pairs[0].offset_index = i; - shift_down(score_offset_pairs, 0, rc); - } - } - - for (int i = 0; i < rc; i++) { - int off = score_offset_pairs[i].offset_index; - remove_lemma_by_offset_index(off); - } - if (rc > 0) { - if (state_ < USER_DICT_OFFSET_DIRTY) state_ = USER_DICT_OFFSET_DIRTY; - } - - free(score_offset_pairs); -} - -inline void UserDict::swap(UserDictScoreOffsetPair *sop, int i, int j) { - int s = sop[i].score; - int p = sop[i].offset_index; - sop[i].score = sop[j].score; - sop[i].offset_index = sop[j].offset_index; - sop[j].score = s; - sop[j].offset_index = p; -} - -void UserDict::shift_down(UserDictScoreOffsetPair *sop, int i, int n) { - int par = i; - while (par < n) { - int left = par * 2 + 1; - int right = left + 1; - if (left >= n && right >= n) break; - if (right >= n) { - if (sop[left].score > sop[par].score) { - swap(sop, left, par); - par = left; - continue; - } - } else if (sop[left].score > sop[right].score && sop[left].score > sop[par].score) { - swap(sop, left, par); - par = left; - continue; - } else if (sop[right].score > sop[left].score && sop[right].score > sop[par].score) { - swap(sop, right, par); - par = right; - continue; - } - break; - } -} - -LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count) { return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL)); } - -LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - if (is_valid_state() == false) return 0; - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off != -1) { - int delta_score = count - scores_[off]; - dict_info_.total_nfreq += delta_score; - scores_[off] = build_score(lmt, count); - if (state_ < USER_DICT_SCORE_DIRTY) state_ = USER_DICT_SCORE_DIRTY; -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("_put_lemma(update)"); -#endif - return ids_[off]; - } else { - if ((dict_info_.limit_lemma_count > 0 && dict_info_.lemma_count >= dict_info_.limit_lemma_count) || (dict_info_.limit_lemma_size > 0 && dict_info_.lemma_size + (2 + (lemma_len << 2)) > dict_info_.limit_lemma_size)) { - // XXX Don't defragment here, it's too time-consuming. - return 0; - } - int flushed = 0; - if (lemma_count_left_ == 0 || lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) { - // XXX When there is no space for new lemma, we flush to disk - // flush_cache() may be called by upper user - // and better place shoule be found instead of here - flush_cache(); - flushed = 1; - // Or simply return and do nothing - // return 0; - } -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)"); -#endif - LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt); -#ifdef ___SYNC_ENABLED___ - if (syncs_ && id != 0) { - queue_lemma_for_sync(id); - } -#endif - return id; - } - return 0; -} - -#ifdef ___SYNC_ENABLED___ -void UserDict::queue_lemma_for_sync(LemmaIdType id) { - if (dict_info_.sync_count < sync_count_size_) { - syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; - } else { - uint32 *syncs = (uint32 *)realloc(syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2); - if (syncs) { - sync_count_size_ += kUserDictPreAlloc; - syncs_ = syncs; - syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; - } - } -} -#endif - -LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected) { -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_BEGIN; -#endif - if (is_valid_state() == false) return 0; - if (is_valid_lemma_id(lemma_id) == false) return 0; - uint32 offset = offsets_by_id_[lemma_id - start_id_]; - uint8 lemma_len = get_lemma_nchar(offset); - char16 *lemma_str = get_lemma_word(offset); - uint16 *splids = get_lemma_spell_ids(offset); - - int32 off = locate_in_offsets(lemma_str, splids, lemma_len); - if (off != -1) { - int score = scores_[off]; - int count = extract_score_freq(score); - uint64 lmt = extract_score_lmt(score); - if (count + delta_count > kUserDictMaxFrequency || count + delta_count < count) { - delta_count = kUserDictMaxFrequency - count; - } - count += delta_count; - dict_info_.total_nfreq += delta_count; - if (selected) { - lmt = time(NULL); - } - scores_[off] = build_score(lmt, count); - if (state_ < USER_DICT_SCORE_DIRTY) state_ = USER_DICT_SCORE_DIRTY; -#ifdef ___DEBUG_PERF___ - DEBUG_PERF_END; - LOGD_PERF("update_lemma"); -#endif -#ifdef ___SYNC_ENABLED___ - queue_lemma_for_sync(ids_[off]); -#endif - return ids_[off]; - } - return 0; -} - -size_t UserDict::get_total_lemma_count() { return dict_info_.total_nfreq; } - -void UserDict::set_total_lemma_count_of_others(size_t count) { total_other_nfreq_ = count; } - -LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { - LemmaIdType id = get_max_lemma_id() + 1; - size_t offset = dict_info_.lemma_size; - if (offset > kUserDictOffsetMask) return 0; - - lemmas_[offset] = 0; - lemmas_[offset + 1] = (uint8)lemma_len; - for (size_t i = 0; i < lemma_len; i++) { - *((uint16 *)&lemmas_[offset + 2 + (i << 1)]) = splids[i]; - *((char16 *)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)]) = lemma_str[i]; - } - uint32 off = dict_info_.lemma_count; - offsets_[off] = offset; - scores_[off] = build_score(lmt, count); - ids_[off] = id; -#ifdef ___PREDICT_ENABLED___ - predicts_[off] = offset; -#endif - - offsets_by_id_[id - start_id_] = offset; - - dict_info_.lemma_count++; - dict_info_.lemma_size += (2 + (lemma_len << 2)); - lemma_count_left_--; - lemma_size_left_ -= (2 + (lemma_len << 2)); - - // Sort - - UserDictSearchable searchable; - prepare_locate(&searchable, splids, lemma_len); - - size_t i = 0; - while (i < off) { - offset = offsets_[i]; - uint32 nchar = get_lemma_nchar(offset); - uint16 *spl = get_lemma_spell_ids(offset); - - if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable)) break; - i++; - } - if (i != off) { - uint32 temp = offsets_[off]; - memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2); - offsets_[i] = temp; - - temp = scores_[off]; - memmove(scores_ + i + 1, scores_ + i, (off - i) << 2); - scores_[i] = temp; - - temp = ids_[off]; - memmove(ids_ + i + 1, ids_ + i, (off - i) << 2); - ids_[i] = temp; - } - -#ifdef ___PREDICT_ENABLED___ - uint32 j = 0; - uint16 *words_new = get_lemma_word(predicts_[off]); - j = locate_where_to_insert_in_predicts(words_new, lemma_len); - if (j != off) { - uint32 temp = predicts_[off]; - memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2); - predicts_[j] = temp; - } -#endif - - if (state_ < USER_DICT_LEMMA_DIRTY) state_ = USER_DICT_LEMMA_DIRTY; - -#ifdef ___CACHE_ENABLED___ - cache_init(); -#endif - - dict_info_.total_nfreq += count; - return id; -} -} // namespace ime_pinyin +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../include/userdict.h" +#include "../include/splparser.h" +#include "../include/ngram.h" +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif +#include +#include +#include +#include +#include +#ifdef _WIN32 +#undef max +#undef min +#include +#else +#include +#endif +#include + +namespace ime_pinyin { + +#ifdef _WIN32 +static int gettimeofday(struct timeval *tp, void *) { + if (!tp) { + return -1; + } + + // 获取当前时间的 FILETIME + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + + // 将 FILETIME 转换为 1970 年以来的时间(Unix 时间) + ULARGE_INTEGER ull; + ull.LowPart = ft.dwLowDateTime; + ull.HighPart = ft.dwHighDateTime; + + // FILETIME 是从 1601 年 1 月 1 日开始的 100 纳秒单位时间 + const long long WINDOWS_TICKS_PER_SEC = 10000000LL; // 每秒 1e7 个 100 纳秒单位 + const long long EPOCH_DIFFERENCE = 11644473600LL; // Unix epoch 到 Windows epoch 的秒数差 + + long long unix_time_in_microseconds = ull.QuadPart / 10 - EPOCH_DIFFERENCE * 1000000LL; + + // 填充 timeval 结构 + tp->tv_sec = (long)(unix_time_in_microseconds / 1000000LL); + tp->tv_usec = (long)(unix_time_in_microseconds % 1000000LL); + + return 0; +} +#endif + +#ifdef ___DEBUG_PERF___ +static uint64 _ellapse_ = 0; +static struct timeval _tv_start_, _tv_end_; +#define DEBUG_PERF_BEGIN \ + do { \ + gettimeofday(&_tv_start_, NULL); \ + } while (0) +#define DEBUG_PERF_END \ + do { \ + gettimeofday(&_tv_end_, NULL); \ + _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + (_tv_end_.tv_usec - _tv_start_.tv_usec); \ + } while (0) +#define LOGD_PERF(message) ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_); +#else +#define DEBUG_PERF_BEGIN +#define DEBUG_PERF_END +#define LOGD_PERF(message) +#endif + +// XXX File load and write are thread-safe by g_mutex_ +#ifdef _WIN32 +static CRITICAL_SECTION g_mutex_; // 使用 Windows 的临界区对象 +#define pthread_mutex_lock(MUTEX) EnterCriticalSection(MUTEX) +#define pthread_mutex_unlock(MUTEX) LeaveCriticalSection(MUTEX) +#define pthread_mutex_trylock(MUTEX) (TryEnterCriticalSection(MUTEX) != 0) +#else +#include +static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER; +#endif +static struct timeval g_last_update_ = {0, 0}; + +inline uint32 UserDict::get_dict_file_size(UserDictInfo *info) { + return (4 + info->lemma_size + (info->lemma_count << 3) +#ifdef ___PREDICT_ENABLED___ + + (info->lemma_count << 2) +#endif +#ifdef ___SYNC_ENABLED___ + + (info->sync_count << 2) +#endif + + sizeof(*info)); +} + +inline LmaScoreType UserDict::translate_score(int raw_score) { + // 1) ori_freq: original user frequency + uint32 ori_freq = extract_score_freq(raw_score); + // 2) lmt_off: lmt index (week offset for example) + uint64 lmt_off = ((raw_score & 0xffff0000) >> 16); + if (kUserDictLMTBitWidth < 16) { + uint64 mask = ~(1 << kUserDictLMTBitWidth); + lmt_off &= mask; + } + // 3) now_off: current time index (current week offset for example) + // assuming load_time_ is around current time + uint64 now_off = load_time_.tv_sec; + now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity; + now_off = (now_off << (64 - kUserDictLMTBitWidth)); + now_off = (now_off >> (64 - kUserDictLMTBitWidth)); + // 4) factor: decide expand-factor + int delta = now_off - lmt_off; + if (delta > 4) delta = 4; + int factor = 80 - (delta << 4); + + double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_); + return (LmaScoreType)(log((double)factor * (double)ori_freq / tf) * NGram::kLogValueAmplifier); +} + +inline int UserDict::extract_score_freq(int raw_score) { + // Frequence stored in lowest 16 bits + int freq = (raw_score & 0x0000ffff); + return freq; +} + +inline uint64 UserDict::extract_score_lmt(int raw_score) { + uint64 lmt = ((raw_score & 0xffff0000) >> 16); + if (kUserDictLMTBitWidth < 16) { + uint64 mask = ~(1 << kUserDictLMTBitWidth); + lmt &= mask; + } + lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince; + return lmt; +} + +inline int UserDict::build_score(uint64 lmt, int freq) { + lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity; + lmt = (lmt << (64 - kUserDictLMTBitWidth)); + lmt = (lmt >> (64 - kUserDictLMTBitWidth)); + uint16 lmt16 = (uint16)lmt; + int s = freq; + s &= 0x0000ffff; + s = (lmt16 << 16) | s; + return s; +} + +inline int64 UserDict::utf16le_atoll(uint16 *s, int len) { + int64 ret = 0; + if (len <= 0) return ret; + + int flag = 1; + const uint16 *endp = s + len; + if (*s == '-') { + flag = -1; + s++; + } else if (*s == '+') { + s++; + } + + while (*s >= '0' && *s <= '9' && s < endp) { + ret += ret * 10 + (*s) - '0'; + s++; + } + return ret * flag; +} + +inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) { + if (!s || size <= 0) return 0; + uint16 *endp = s + size; + int ret_len = 0; + if (v < 0) { + *(s++) = '-'; + ++ret_len; + v *= -1; + } + + uint16 *b = s; + while (s < endp && v != 0) { + *(s++) = '0' + (v % 10); + v = v / 10; + ++ret_len; + } + + if (v != 0) return 0; + + --s; + + while (b < s) { + *b = *s; + ++b, --s; + } + + return ret_len; +} + +inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) { + offset &= kUserDictOffsetMask; + lemmas_[offset] |= flag; +} + +inline char UserDict::get_lemma_flag(uint32 offset) { + offset &= kUserDictOffsetMask; + return (char)(lemmas_[offset]); +} + +inline char UserDict::get_lemma_nchar(uint32 offset) { + offset &= kUserDictOffsetMask; + return (char)(lemmas_[offset + 1]); +} + +inline uint16 *UserDict::get_lemma_spell_ids(uint32 offset) { + offset &= kUserDictOffsetMask; + return (uint16 *)(lemmas_ + offset + 2); +} + +inline uint16 *UserDict::get_lemma_word(uint32 offset) { + offset &= kUserDictOffsetMask; + uint8 nchar = get_lemma_nchar(offset); + return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1)); +} + +inline LemmaIdType UserDict::get_max_lemma_id() { + // When a lemma is deleted, we don't not claim its id back for + // simplicity and performance + return start_id_ + dict_info_.lemma_count - 1; +} + +inline bool UserDict::is_valid_lemma_id(LemmaIdType id) { + if (id >= start_id_ && id <= get_max_lemma_id()) return true; + return false; +} + +inline bool UserDict::is_valid_state() { + if (state_ == USER_DICT_NONE) return false; + return true; +} + +UserDict::UserDict() + : start_id_(0), + version_(0), + lemmas_(NULL), + offsets_(NULL), + scores_(NULL), + ids_(NULL), +#ifdef ___PREDICT_ENABLED___ + predicts_(NULL), +#endif +#ifdef ___SYNC_ENABLED___ + syncs_(NULL), + sync_count_size_(0), +#endif + offsets_by_id_(NULL), + lemma_count_left_(0), + lemma_size_left_(0), + dict_file_(NULL), + state_(USER_DICT_NONE) { + memset(&dict_info_, 0, sizeof(dict_info_)); + memset(&load_time_, 0, sizeof(load_time_)); +#ifdef ___CACHE_ENABLED___ + cache_init(); +#endif +} + +UserDict::~UserDict() { close_dict(); } + +bool UserDict::load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + dict_file_ = strdup(file_name); + if (!dict_file_) return false; + + start_id_ = start_id; + + if (false == validate(file_name) && false == reset(file_name)) { + goto error; + } + if (false == load(file_name, start_id)) { + goto error; + } + + state_ = USER_DICT_SYNC; + + gettimeofday(&load_time_, NULL); + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("load_dict"); +#endif + return true; +error: + free((void *)dict_file_); + dict_file_ = NULL; + start_id_ = 0; + return false; +} + +bool UserDict::close_dict() { + if (state_ == USER_DICT_NONE) return true; + if (state_ == USER_DICT_SYNC) goto out; + + // If dictionary is written back by others, + // we can not simply write back here + // To do a safe flush, we have to discard all newly added + // lemmas and try to reload dict file. + pthread_mutex_lock(&g_mutex_); + if (load_time_.tv_sec > g_last_update_.tv_sec || (load_time_.tv_sec == g_last_update_.tv_sec && load_time_.tv_usec > g_last_update_.tv_usec)) { + write_back(); + gettimeofday(&g_last_update_, NULL); + } + pthread_mutex_unlock(&g_mutex_); + +out: + free((void *)dict_file_); + free(lemmas_); + free(offsets_); + free(offsets_by_id_); + free(scores_); + free(ids_); +#ifdef ___PREDICT_ENABLED___ + free(predicts_); +#endif + + version_ = 0; + dict_file_ = NULL; + lemmas_ = NULL; +#ifdef ___SYNC_ENABLED___ + syncs_ = NULL; + sync_count_size_ = 0; +#endif + offsets_ = NULL; + offsets_by_id_ = NULL; + scores_ = NULL; + ids_ = NULL; +#ifdef ___PREDICT_ENABLED___ + predicts_ = NULL; +#endif + + memset(&dict_info_, 0, sizeof(dict_info_)); + lemma_count_left_ = 0; + lemma_size_left_ = 0; + state_ = USER_DICT_NONE; + + return true; +} + +size_t UserDict::number_of_lemmas() { return dict_info_.lemma_count; } + +void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { return; } + +MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num) { + if (is_valid_state() == false) return 0; + + bool need_extend = false; + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1, lpi_items, lpi_max, &need_extend); +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("extend_dict"); +#endif + return ((*lpi_num > 0 || need_extend) ? 1 : 0); +} + +int UserDict::is_fuzzy_prefix_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable) { + if (len1 < searchable->splids_len) return 0; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + uint32 i = 0; + for (i = 0; i < searchable->splids_len; i++) { + const char py1 = *spl_trie.get_spelling_str(id1[i]); + uint16 off = 8 * (i % 4); + const char py2 = ((searchable->signature[i / 4] & (0xff << off)) >> off); + if (py1 == py2) continue; + return 0; + } + return 1; +} + +int UserDict::fuzzy_compare_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable) { + if (len1 < searchable->splids_len) return -1; + if (len1 > searchable->splids_len) return 1; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + uint32 i = 0; + for (i = 0; i < len1; i++) { + const char py1 = *spl_trie.get_spelling_str(id1[i]); + uint16 off = 8 * (i % 4); + const char py2 = ((searchable->signature[i / 4] & (0xff << off)) >> off); + if (py1 == py2) continue; + if (py1 > py2) return 1; + return -1; + } + return 0; +} + +bool UserDict::is_prefix_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable) { + if (fulllen < searchable->splids_len) return false; + + uint32 i = 0; + for (; i < searchable->splids_len; i++) { + uint16 start_id = searchable->splid_start[i]; + uint16 count = searchable->splid_count[i]; + if (fullids[i] >= start_id && fullids[i] < start_id + count) + continue; + else + return false; + } + return true; +} + +bool UserDict::equal_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable) { + if (fulllen != searchable->splids_len) return false; + + uint32 i = 0; + for (; i < fulllen; i++) { + uint16 start_id = searchable->splid_start[i]; + uint16 count = searchable->splid_count[i]; + if (fullids[i] >= start_id && fullids[i] < start_id + count) + continue; + else + return false; + } + return true; +} + +int32 UserDict::locate_first_in_offsets(const UserDictSearchable *searchable) { + int32 begin = 0; + int32 end = dict_info_.lemma_count - 1; + int32 middle = -1; + + int32 first_prefix = middle; + int32 last_matched = middle; + + while (begin <= end) { + middle = (begin + end) >> 1; + uint32 offset = offsets_[middle]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 *splids = get_lemma_spell_ids(offset); + int cmp = fuzzy_compare_spell_id(splids, nchar, searchable); + int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable); + + if (pre) first_prefix = middle; + + if (cmp < 0) { + begin = middle + 1; + } else if (cmp > 0) { + end = middle - 1; + } else { + end = middle - 1; + last_matched = middle; + } + } + + return first_prefix; +} + +void UserDict::prepare_locate(UserDictSearchable *searchable, const uint16 *splid_str, uint16 splid_str_len) { + searchable->splids_len = splid_str_len; + memset(searchable->signature, 0, sizeof(searchable->signature)); + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + uint32 i = 0; + for (; i < splid_str_len; i++) { + if (spl_trie.is_half_id(splid_str[i])) { + searchable->splid_count[i] = spl_trie.half_to_full(splid_str[i], &(searchable->splid_start[i])); + } else { + searchable->splid_count[i] = 1; + searchable->splid_start[i] = splid_str[i]; + } + const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]); + searchable->signature[i >> 2] |= (py << (8 * (i % 4))); + } +} + +size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max) { return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL); } + +size_t UserDict::_get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool *need_extend) { + bool tmp_extend; + if (!need_extend) need_extend = &tmp_extend; + + *need_extend = false; + + if (is_valid_state() == false) return 0; + if (lpi_max <= 0) return 0; + + if (0 == pthread_mutex_trylock(&g_mutex_)) { + if (load_time_.tv_sec < g_last_update_.tv_sec || (load_time_.tv_sec == g_last_update_.tv_sec && load_time_.tv_usec < g_last_update_.tv_usec)) { + // Others updated disk file, have to reload + pthread_mutex_unlock(&g_mutex_); + flush_cache(); + } else { + pthread_mutex_unlock(&g_mutex_); + } + } else { + } + + UserDictSearchable searchable; + prepare_locate(&searchable, splid_str, splid_str_len); + + uint32 max_off = dict_info_.lemma_count; +#ifdef ___CACHE_ENABLED___ + int32 middle; + uint32 start, count; + bool cached = cache_hit(&searchable, &start, &count); + if (cached) { + middle = start; + max_off = start + count; + } else { + middle = locate_first_in_offsets(&searchable); + start = middle; + } +#else + int32 middle = locate_first_in_offsets(&searchable); +#endif + + if (middle == -1) { +#ifdef ___CACHE_ENABLED___ + if (!cached) cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0); +#endif + return 0; + } + + size_t lpi_current = 0; + + bool fuzzy_break = false; + bool prefix_break = false; + while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) { + if (lpi_current >= lpi_max) break; + uint32 offset = offsets_[middle]; + // Ignore deleted lemmas + if (offset & kUserDictOffsetFlagRemove) { + middle++; + continue; + } + uint8 nchar = get_lemma_nchar(offset); + uint16 *splids = get_lemma_spell_ids(offset); +#ifdef ___CACHE_ENABLED___ + if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { +#else + if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { +#endif + fuzzy_break = true; + } + + if (prefix_break == false) { + if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) { + if (*need_extend == false && is_prefix_spell_id(splids, nchar, &searchable)) { + *need_extend = true; + } + } else { + prefix_break = true; + } + } + + if (equal_spell_id(splids, nchar, &searchable) == true) { + lpi_items[lpi_current].psb = translate_score(scores_[middle]); + lpi_items[lpi_current].id = ids_[middle]; + lpi_items[lpi_current].lma_len = nchar; + lpi_current++; + } + middle++; + } + +#ifdef ___CACHE_ENABLED___ + if (!cached) { + count = middle - start; + cache_push(USER_DICT_CACHE, &searchable, start, count); + } +#endif + + return lpi_current; +} + +uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max) { + if (is_valid_state() == false) return 0; + if (is_valid_lemma_id(id_lemma) == false) return 0; + uint32 offset = offsets_by_id_[id_lemma - start_id_]; + uint8 nchar = get_lemma_nchar(offset); + char16 *str = get_lemma_word(offset); + uint16 m = nchar < str_max - 1 ? nchar : str_max - 1; + int i = 0; + for (; i < m; i++) { + str_buf[i] = str[i]; + } + str_buf[i] = 0; + return m; +} + +uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid) { + if (is_valid_lemma_id(id_lemma) == false) return 0; + uint32 offset = offsets_by_id_[id_lemma - start_id_]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 *ids = get_lemma_spell_ids(offset); + int i = 0; + for (; i < nchar && i < splids_max; i++) splids[i] = ids[i]; + return i; +} + +size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used) { + uint32 new_added = 0; +#ifdef ___PREDICT_ENABLED___ + int32 end = dict_info_.lemma_count - 1; + int j = locate_first_in_predicts((const uint16 *)last_hzs, hzs_len); + if (j == -1) return 0; + + while (j <= end) { + uint32 offset = predicts_[j]; + // Ignore deleted lemmas + if (offset & kUserDictOffsetFlagRemove) { + j++; + continue; + } + uint32 nchar = get_lemma_nchar(offset); + uint16 *words = get_lemma_word(offset); + uint16 *splids = get_lemma_spell_ids(offset); + + if (nchar <= hzs_len) { + j++; + continue; + } + + if (memcmp(words, last_hzs, hzs_len << 1) == 0) { + if (new_added >= npre_max) { + return new_added; + } + uint32 cpy_len = (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1)) - (hzs_len << 1); + npre_items[new_added].his_len = hzs_len; + npre_items[new_added].psb = get_lemma_score(words, splids, nchar); + memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len); + if ((cpy_len >> 1) < kMaxPredictSize) { + npre_items[new_added].pre_hzs[cpy_len >> 1] = 0; + } + new_added++; + } else { + break; + } + + j++; + } +#endif + return new_added; +} + +int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len) { + int32 max_off = dict_info_.lemma_count; + + UserDictSearchable searchable; + prepare_locate(&searchable, splid_str, lemma_len); +#ifdef ___CACHE_ENABLED___ + int32 off; + uint32 start, count; + bool cached = load_cache(&searchable, &start, &count); + if (cached) { + off = start; + max_off = start + count; + } else { + off = locate_first_in_offsets(&searchable); + start = off; + } +#else + int32 off = locate_first_in_offsets(&searchable); +#endif + + if (off == -1) { + return off; + } + + while (off < max_off) { + uint32 offset = offsets_[off]; + if (offset & kUserDictOffsetFlagRemove) { + off++; + continue; + } + uint16 *splids = get_lemma_spell_ids(offset); +#ifdef ___CACHE_ENABLED___ + if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) break; +#else + if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) break; +#endif + if (equal_spell_id(splids, lemma_len, &searchable) == true) { + uint16 *str = get_lemma_word(offset); + uint32 i = 0; + for (i = 0; i < lemma_len; i++) { + if (str[i] == lemma_str[i]) continue; + break; + } + if (i < lemma_len) { + off++; + continue; + } +#ifdef ___CACHE_ENABLED___ + // No need to save_cache here, since current function is invoked by + // put_lemma. It's rarely possible for a user input same lemma twice. + // That means first time user type a new lemma, it is newly added into + // user dictionary, then it's possible that user type the same lemma + // again. + // Another reason save_cache can not be invoked here is this function + // aborts when lemma is found, and it never knows the count. +#endif + return off; + } + off++; + } + + return -1; +} + +#ifdef ___PREDICT_ENABLED___ +uint32 UserDict::locate_where_to_insert_in_predicts(const uint16 *words, int lemma_len) { + int32 begin = 0; + int32 end = dict_info_.lemma_count - 1; + int32 middle = end; + + uint32 last_matched = middle; + + while (begin <= end) { + middle = (begin + end) >> 1; + uint32 offset = offsets_[middle]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 *ws = get_lemma_word(offset); + + uint32 minl = nchar < lemma_len ? nchar : lemma_len; + uint32 k = 0; + int cmp = 0; + + for (; k < minl; k++) { + if (ws[k] < words[k]) { + cmp = -1; + break; + } else if (ws[k] > words[k]) { + cmp = 1; + break; + } + } + if (cmp == 0) { + if (nchar < lemma_len) + cmp = -1; + else if (nchar > lemma_len) + cmp = 1; + } + + if (cmp < 0) { + begin = middle + 1; + last_matched = middle; + } else if (cmp > 0) { + end = middle - 1; + } else { + end = middle - 1; + last_matched = middle; + } + } + + return last_matched; +} + +int32 UserDict::locate_first_in_predicts(const uint16 *words, int lemma_len) { + int32 begin = 0; + int32 end = dict_info_.lemma_count - 1; + int32 middle = -1; + + int32 last_matched = middle; + + while (begin <= end) { + middle = (begin + end) >> 1; + uint32 offset = offsets_[middle]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 *ws = get_lemma_word(offset); + + uint32 minl = nchar < lemma_len ? nchar : lemma_len; + uint32 k = 0; + int cmp = 0; + + for (; k < minl; k++) { + if (ws[k] < words[k]) { + cmp = -1; + break; + } else if (ws[k] > words[k]) { + cmp = 1; + break; + } + } + if (cmp == 0) { + if (nchar >= lemma_len) last_matched = middle; + if (nchar < lemma_len) + cmp = -1; + else if (nchar > lemma_len) + cmp = 1; + } + + if (cmp < 0) { + begin = middle + 1; + } else if (cmp > 0) { + end = middle - 1; + } else { + end = middle - 1; + } + } + + return last_matched; +} + +#endif + +LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off == -1) { + return 0; + } + + return ids_[off]; +} + +LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) { + if (is_valid_state() == false) return 0; + if (is_valid_lemma_id(lemma_id) == false) return 0; + + return translate_score(_get_lemma_score(lemma_id)); +} + +LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { + if (is_valid_state() == false) return 0; + return translate_score(_get_lemma_score(lemma_str, splids, lemma_len)); +} + +int UserDict::_get_lemma_score(LemmaIdType lemma_id) { + if (is_valid_state() == false) return 0; + if (is_valid_lemma_id(lemma_id) == false) return 0; + + uint32 offset = offsets_by_id_[lemma_id - start_id_]; + + uint32 nchar = get_lemma_nchar(offset); + uint16 *spl = get_lemma_spell_ids(offset); + uint16 *wrd = get_lemma_word(offset); + + int32 off = locate_in_offsets(wrd, spl, nchar); + if (off == -1) { + return 0; + } + + return scores_[off]; +} + +int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { + if (is_valid_state() == false) return 0; + + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off == -1) { + return 0; + } + + return scores_[off]; +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::remove_lemma_from_sync_list(uint32 offset) { + offset &= kUserDictOffsetMask; + uint32 i = 0; + for (; i < dict_info_.sync_count; i++) { + unsigned int off = (syncs_[i] & kUserDictOffsetMask); + if (off == offset) break; + } + if (i < dict_info_.sync_count) { + syncs_[i] = syncs_[dict_info_.sync_count - 1]; + dict_info_.sync_count--; + } +} +#endif + +#ifdef ___PREDICT_ENABLED___ +void UserDict::remove_lemma_from_predict_list(uint32 offset) { + offset &= kUserDictOffsetMask; + uint32 i = 0; + for (; i < dict_info_.lemma_count; i++) { + unsigned int off = (predicts_[i] & kUserDictOffsetMask); + if (off == offset) { + predicts_[i] |= kUserDictOffsetFlagRemove; + break; + } + } +} +#endif + +bool UserDict::remove_lemma_by_offset_index(int offset_index) { + if (is_valid_state() == false) return 0; + + int32 off = offset_index; + if (off == -1) { + return false; + } + + uint32 offset = offsets_[off]; + uint32 nchar = get_lemma_nchar(offset); + + offsets_[off] |= kUserDictOffsetFlagRemove; + +#ifdef ___SYNC_ENABLED___ + // Remove corresponding sync item + remove_lemma_from_sync_list(offset); +#endif + +#ifdef ___PREDICT_ENABLED___ + remove_lemma_from_predict_list(offset); +#endif + dict_info_.free_count++; + dict_info_.free_size += (2 + (nchar << 2)); + + if (state_ < USER_DICT_OFFSET_DIRTY) state_ = USER_DICT_OFFSET_DIRTY; + return true; +} + +bool UserDict::remove_lemma(LemmaIdType lemma_id) { + if (is_valid_state() == false) return 0; + if (is_valid_lemma_id(lemma_id) == false) return false; + uint32 offset = offsets_by_id_[lemma_id - start_id_]; + + uint32 nchar = get_lemma_nchar(offset); + uint16 *spl = get_lemma_spell_ids(offset); + uint16 *wrd = get_lemma_word(offset); + + int32 off = locate_in_offsets(wrd, spl, nchar); + + return remove_lemma_by_offset_index(off); +} + +void UserDict::flush_cache() { + LemmaIdType start_id = start_id_; + if (!dict_file_) return; + const char *file = strdup(dict_file_); + if (!file) return; + close_dict(); + load_dict(file, start_id, kUserDictIdEnd); + free((void *)file); +#ifdef ___CACHE_ENABLED___ + cache_init(); +#endif + return; +} + +bool UserDict::reset(const char *file) { + FILE *fp = fopen(file, "w+"); + if (!fp) { + return false; + } + uint32 version = kUserDictVersion; + size_t wred = fwrite(&version, 1, 4, fp); + UserDictInfo info; + memset(&info, 0, sizeof(info)); + // By default, no limitation for lemma count and size + // thereby, reclaim_ratio is never used + wred += fwrite(&info, 1, sizeof(info), fp); + if (wred != sizeof(info) + sizeof(version)) { + fclose(fp); + unlink(file); + return false; + } + fclose(fp); + return true; +} + +bool UserDict::validate(const char *file) { + // b is ignored in POSIX compatible os including Linux + // while b is important flag for Windows to specify binary mode + FILE *fp = fopen(file, "rb"); + if (!fp) { + return false; + } + + size_t size; + size_t readed; + uint32 version; + UserDictInfo dict_info; + + // validate + int err = fseek(fp, 0, SEEK_END); + if (err) { + goto error; + } + + size = ftell(fp); + if (size < 4 + sizeof(dict_info)) { + goto error; + } + + err = fseek(fp, 0, SEEK_SET); + if (err) { + goto error; + } + + readed = fread(&version, 1, sizeof(version), fp); + if (readed < sizeof(version)) { + goto error; + } + if (version != kUserDictVersion) { + goto error; + } + + err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); + if (err) { + goto error; + } + + readed = fread(&dict_info, 1, sizeof(dict_info), fp); + if (readed != sizeof(dict_info)) { + goto error; + } + + if (size != get_dict_file_size(&dict_info)) { + goto error; + } + + fclose(fp); + return true; + +error: + fclose(fp); + return false; +} + +bool UserDict::load(const char *file, LemmaIdType start_id) { + if (0 != pthread_mutex_trylock(&g_mutex_)) { + return false; + } + // b is ignored in POSIX compatible os including Linux + // while b is important flag for Windows to specify binary mode + FILE *fp = fopen(file, "rb"); + if (!fp) { + pthread_mutex_unlock(&g_mutex_); + return false; + } + + size_t readed, toread; + UserDictInfo dict_info; + uint8 *lemmas = NULL; + uint32 *offsets = NULL; +#ifdef ___SYNC_ENABLED___ + uint32 *syncs = NULL; +#endif + uint32 *scores = NULL; + uint32 *ids = NULL; + uint32 *offsets_by_id = NULL; +#ifdef ___PREDICT_ENABLED___ + uint32 *predicts = NULL; +#endif + size_t i; + int err; + + err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); + if (err) goto error; + + readed = fread(&dict_info, 1, sizeof(dict_info), fp); + if (readed != sizeof(dict_info)) goto error; + + lemmas = (uint8 *)malloc(dict_info.lemma_size + (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)))); + + if (!lemmas) goto error; + + offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!offsets) goto error; + +#ifdef ___PREDICT_ENABLED___ + predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!predicts) goto error; +#endif + +#ifdef ___SYNC_ENABLED___ + syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2); + if (!syncs) goto error; +#endif + + scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!scores) goto error; + + ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!ids) goto error; + + offsets_by_id = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!offsets_by_id) goto error; + + err = fseek(fp, 4, SEEK_SET); + if (err) goto error; + + readed = 0; + while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) { + readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp); + } + if (readed < dict_info.lemma_size) goto error; + + toread = (dict_info.lemma_count << 2); + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8 *)offsets) + readed), 1, toread - readed, fp); + } + if (readed < toread) goto error; + +#ifdef ___PREDICT_ENABLED___ + toread = (dict_info.lemma_count << 2); + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8 *)predicts) + readed), 1, toread - readed, fp); + } + if (readed < toread) goto error; +#endif + + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8 *)scores) + readed), 1, toread - readed, fp); + } + if (readed < toread) goto error; + +#ifdef ___SYNC_ENABLED___ + toread = (dict_info.sync_count << 2); + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8 *)syncs) + readed), 1, toread - readed, fp); + } + if (readed < toread) goto error; +#endif + + for (i = 0; i < dict_info.lemma_count; i++) { + ids[i] = start_id + i; + offsets_by_id[i] = offsets[i]; + } + + lemmas_ = lemmas; + offsets_ = offsets; +#ifdef ___SYNC_ENABLED___ + syncs_ = syncs; + sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc; +#endif + offsets_by_id_ = offsets_by_id; + scores_ = scores; + ids_ = ids; +#ifdef ___PREDICT_ENABLED___ + predicts_ = predicts; +#endif + lemma_count_left_ = kUserDictPreAlloc; + lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)); + memcpy(&dict_info_, &dict_info, sizeof(dict_info)); + state_ = USER_DICT_SYNC; + + fclose(fp); + + pthread_mutex_unlock(&g_mutex_); + return true; + +error: + if (lemmas) free(lemmas); + if (offsets) free(offsets); +#ifdef ___SYNC_ENABLED___ + if (syncs) free(syncs); +#endif + if (scores) free(scores); + if (ids) free(ids); + if (offsets_by_id) free(offsets_by_id); +#ifdef ___PREDICT_ENABLED___ + if (predicts) free(predicts); +#endif + fclose(fp); + pthread_mutex_unlock(&g_mutex_); + return false; +} + +void UserDict::write_back() { + // XXX write back is only allowed from close_dict due to thread-safe sake + if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC) return; + int fd = open(dict_file_, O_WRONLY); + if (fd == -1) return; + switch (state_) { + case USER_DICT_DEFRAGMENTED: + write_back_all(fd); + break; + case USER_DICT_LEMMA_DIRTY: + write_back_lemma(fd); + break; + case USER_DICT_OFFSET_DIRTY: + write_back_offset(fd); + break; + case USER_DICT_SCORE_DIRTY: + write_back_score(fd); + break; +#ifdef ___SYNC_ENABLED___ + case USER_DICT_SYNC_DIRTY: + write_back_sync(fd); + break; +#endif + default: + break; + } + // It seems truncate is not need on Linux, Windows except Mac + // I am doing it here anyway for safety. + off_t cur = lseek(fd, 0, SEEK_CUR); +#ifndef _WIN32 + ftruncate(fd, cur); +#endif + close(fd); + state_ = USER_DICT_SYNC; +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::write_back_sync(int fd) { + int err = lseek(fd, + 4 + dict_info_.lemma_size + (dict_info_.lemma_count << 3) +#ifdef ___PREDICT_ENABLED___ + + (dict_info_.lemma_count << 2) +#endif + , + SEEK_SET); + if (err == -1) return; + write(fd, syncs_, dict_info_.sync_count << 2); + write(fd, &dict_info_, sizeof(dict_info_)); +} +#endif + +void UserDict::write_back_offset(int fd) { + int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET); + if (err == -1) return; + write(fd, offsets_, dict_info_.lemma_count << 2); +#ifdef ___PREDICT_ENABLED___ + write(fd, predicts_, dict_info_.lemma_count << 2); +#endif + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +void UserDict::write_back_score(int fd) { + int err = lseek(fd, + 4 + dict_info_.lemma_size + (dict_info_.lemma_count << 2) +#ifdef ___PREDICT_ENABLED___ + + (dict_info_.lemma_count << 2) +#endif + , + SEEK_SET); + if (err == -1) return; + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +void UserDict::write_back_lemma(int fd) { + int err = lseek(fd, 4, SEEK_SET); + if (err == -1) return; + // New lemmas are always appended, no need to write whole lemma block + size_t need_write = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_; + err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR); + if (err == -1) return; + write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write); + + write(fd, offsets_, dict_info_.lemma_count << 2); +#ifdef ___PREDICT_ENABLED___ + write(fd, predicts_, dict_info_.lemma_count << 2); +#endif + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +void UserDict::write_back_all(int fd) { + // XXX lemma_size is handled differently in writeall + // and writelemma. I update lemma_size and lemma_count in different + // places for these two cases. Should fix it to make it consistent. + int err = lseek(fd, 4, SEEK_SET); + if (err == -1) return; + write(fd, lemmas_, dict_info_.lemma_size); + write(fd, offsets_, dict_info_.lemma_count << 2); +#ifdef ___PREDICT_ENABLED___ + write(fd, predicts_, dict_info_.lemma_count << 2); +#endif + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +#ifdef ___CACHE_ENABLED___ +bool UserDict::load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length) { + UserDictCache *cache = &caches_[searchable->splids_len - 1]; + if (cache->head == cache->tail) return false; + + uint16 j, sig_len = kMaxLemmaSize / 4; + uint16 i = cache->head; + while (1) { + j = 0; + for (; j < sig_len; j++) { + if (cache->signatures[i][j] != searchable->signature[j]) break; + } + if (j < sig_len) { + i++; + if (i >= kUserDictCacheSize) i -= kUserDictCacheSize; + if (i == cache->tail) break; + continue; + } + *offset = cache->offsets[i]; + *length = cache->lengths[i]; + return true; + } + return false; +} + +void UserDict::save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length) { + UserDictCache *cache = &caches_[searchable->splids_len - 1]; + uint16 next = cache->tail; + + cache->offsets[next] = offset; + cache->lengths[next] = length; + uint16 sig_len = kMaxLemmaSize / 4; + uint16 j = 0; + for (; j < sig_len; j++) { + cache->signatures[next][j] = searchable->signature[j]; + } + + if (++next >= kUserDictCacheSize) { + next -= kUserDictCacheSize; + } + if (next == cache->head) { + cache->head++; + if (cache->head >= kUserDictCacheSize) { + cache->head -= kUserDictCacheSize; + } + } + cache->tail = next; +} + +void UserDict::reset_cache() { memset(caches_, 0, sizeof(caches_)); } + +bool UserDict::load_miss_cache(UserDictSearchable *searchable) { + UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; + if (cache->head == cache->tail) return false; + + uint16 j, sig_len = kMaxLemmaSize / 4; + uint16 i = cache->head; + while (1) { + j = 0; + for (; j < sig_len; j++) { + if (cache->signatures[i][j] != searchable->signature[j]) break; + } + if (j < sig_len) { + i++; + if (i >= kUserDictMissCacheSize) i -= kUserDictMissCacheSize; + if (i == cache->tail) break; + continue; + } + return true; + } + return false; +} + +void UserDict::save_miss_cache(UserDictSearchable *searchable) { + UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; + uint16 next = cache->tail; + + uint16 sig_len = kMaxLemmaSize / 4; + uint16 j = 0; + for (; j < sig_len; j++) { + cache->signatures[next][j] = searchable->signature[j]; + } + + if (++next >= kUserDictMissCacheSize) { + next -= kUserDictMissCacheSize; + } + if (next == cache->head) { + cache->head++; + if (cache->head >= kUserDictMissCacheSize) { + cache->head -= kUserDictMissCacheSize; + } + } + cache->tail = next; +} + +void UserDict::reset_miss_cache() { memset(miss_caches_, 0, sizeof(miss_caches_)); } + +void UserDict::cache_init() { + reset_cache(); + reset_miss_cache(); +} + +bool UserDict::cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length) { + bool hit = load_miss_cache(searchable); + if (hit) { + *offset = 0; + *length = 0; + return true; + } + hit = load_cache(searchable, offset, length); + if (hit) { + return true; + } + return false; +} + +void UserDict::cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length) { + switch (type) { + case USER_DICT_MISS_CACHE: + save_miss_cache(searchable); + break; + case USER_DICT_CACHE: + save_cache(searchable, offset, length); + break; + default: + break; + } +} + +#endif + +void UserDict::defragment(void) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + if (is_valid_state() == false) return; + // Fixup offsets_, set REMOVE flag to lemma's flag if needed + size_t first_freed = 0; + size_t first_inuse = 0; + while (first_freed < dict_info_.lemma_count) { + // Find first freed offset + while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 && first_freed < dict_info_.lemma_count) { + first_freed++; + } + if (first_freed < dict_info_.lemma_count) { + // Save REMOVE flag to lemma flag + int off = offsets_[first_freed]; + set_lemma_flag(off, kUserDictLemmaFlagRemove); + } else { + break; + } + // Find first inuse offse after first_freed + first_inuse = first_freed + 1; + while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) && (first_inuse < dict_info_.lemma_count)) { + // Save REMOVE flag to lemma flag + int off = offsets_[first_inuse]; + set_lemma_flag(off, kUserDictLemmaFlagRemove); + first_inuse++; + } + if (first_inuse >= dict_info_.lemma_count) { + break; + } + // Swap offsets_ + int tmp = offsets_[first_inuse]; + offsets_[first_inuse] = offsets_[first_freed]; + offsets_[first_freed] = tmp; + // Move scores_, no need to swap + tmp = scores_[first_inuse]; + scores_[first_inuse] = scores_[first_freed]; + scores_[first_freed] = tmp; + // Swap ids_ + LemmaIdType tmpid = ids_[first_inuse]; + ids_[first_inuse] = ids_[first_freed]; + ids_[first_freed] = tmpid; + // Go on + first_freed++; + } +#ifdef ___PREDICT_ENABLED___ + // Fixup predicts_ + first_freed = 0; + first_inuse = 0; + while (first_freed < dict_info_.lemma_count) { + // Find first freed offset + while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 && first_freed < dict_info_.lemma_count) { + first_freed++; + } + if (first_freed >= dict_info_.lemma_count) break; + // Find first inuse offse after first_freed + first_inuse = first_freed + 1; + while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove) && (first_inuse < dict_info_.lemma_count)) { + first_inuse++; + } + if (first_inuse >= dict_info_.lemma_count) { + break; + } + // Swap offsets_ + int tmp = predicts_[first_inuse]; + predicts_[first_inuse] = predicts_[first_freed]; + predicts_[first_freed] = tmp; + // Go on + first_freed++; + } +#endif + dict_info_.lemma_count = first_freed; + // Fixup lemmas_ + size_t begin = 0; + size_t end = 0; + size_t dst = 0; + int total_size = dict_info_.lemma_size + lemma_size_left_; + int total_count = dict_info_.lemma_count + lemma_count_left_; + size_t real_size = total_size - lemma_size_left_; + while (dst < real_size) { + unsigned char flag = get_lemma_flag(dst); + unsigned char nchr = get_lemma_nchar(dst); + if ((flag & kUserDictLemmaFlagRemove) == 0) { + dst += nchr * 4 + 2; + continue; + } + break; + } + if (dst >= real_size) return; + + end = dst; + while (end < real_size) { + begin = end + get_lemma_nchar(end) * 4 + 2; + repeat: + // not used any more + if (begin >= real_size) break; + unsigned char flag = get_lemma_flag(begin); + unsigned char nchr = get_lemma_nchar(begin); + if (flag & kUserDictLemmaFlagRemove) { + begin += nchr * 4 + 2; + goto repeat; + } + end = begin + nchr * 4 + 2; + while (end < real_size) { + unsigned char eflag = get_lemma_flag(end); + unsigned char enchr = get_lemma_nchar(end); + if ((eflag & kUserDictLemmaFlagRemove) == 0) { + end += enchr * 4 + 2; + continue; + } + break; + } + memmove(lemmas_ + dst, lemmas_ + begin, end - begin); + for (size_t j = 0; j < dict_info_.lemma_count; j++) { + if (offsets_[j] >= begin && offsets_[j] < end) { + offsets_[j] -= (begin - dst); + offsets_by_id_[ids_[j] - start_id_] = offsets_[j]; + } +#ifdef ___PREDICT_ENABLED___ + if (predicts_[j] >= begin && predicts_[j] < end) { + predicts_[j] -= (begin - dst); + } +#endif + } +#ifdef ___SYNC_ENABLED___ + for (size_t j = 0; j < dict_info_.sync_count; j++) { + if (syncs_[j] >= begin && syncs_[j] < end) { + syncs_[j] -= (begin - dst); + } + } +#endif + dst += (end - begin); + } + + dict_info_.free_count = 0; + dict_info_.free_size = 0; + dict_info_.lemma_size = dst; + lemma_size_left_ = total_size - dict_info_.lemma_size; + lemma_count_left_ = total_count - dict_info_.lemma_count; + + // XXX Without following code, + // offsets_by_id_ is not reordered. + // That's to say, all removed lemmas' ids are not collected back. + // There may not be room for addition of new lemmas due to + // offsests_by_id_ reason, although lemma_size_left_ is fixed. + // By default, we do want defrag as fast as possible, because + // during defrag procedure, other peers can not write new lemmas + // to user dictionary file. + // XXX If write-back is invoked immediately after + // this defragment, no need to fix up following in-mem data. + for (uint32 i = 0; i < dict_info_.lemma_count; i++) { + ids_[i] = start_id_ + i; + offsets_by_id_[i] = offsets_[i]; + } + + state_ = USER_DICT_DEFRAGMENTED; + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("defragment"); +#endif +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) { + if (is_valid_state() == false) return; + if (end > dict_info_.sync_count) end = dict_info_.sync_count; + memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2); + dict_info_.sync_count -= (end - start); + if (state_ < USER_DICT_SYNC_DIRTY) state_ = USER_DICT_SYNC_DIRTY; +} + +int UserDict::get_sync_count() { + if (is_valid_state() == false) return 0; + return dict_info_.sync_count; +} + +LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { + int again = 0; +begin: + LemmaIdType id; + uint32 *syncs_bak = syncs_; + syncs_ = NULL; + id = _put_lemma(lemma_str, splids, lemma_len, count, lmt); + syncs_ = syncs_bak; + if (id == 0 && again == 0) { + if ((dict_info_.limit_lemma_count > 0 && dict_info_.lemma_count >= dict_info_.limit_lemma_count) || (dict_info_.limit_lemma_size > 0 && dict_info_.lemma_size + (2 + (lemma_len << 2)) > dict_info_.limit_lemma_size)) { + // XXX Always reclaim and defrag in sync code path + // sync thread is background thread and ok with heavy work + reclaim(); + defragment(); + flush_cache(); + again = 1; + goto begin; + } + } + return id; +} + +int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 *lemmas, int len) { + int newly_added = 0; + + SpellingParser *spl_parser = new SpellingParser(); + if (!spl_parser) { + return 0; + } +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + char16 *ptr = lemmas; + + // Extract pinyin,words,frequence,last_mod_time + char16 *p = ptr, *py16 = ptr; + char16 *hz16 = NULL; + int py16_len = 0; + uint16 splid[kMaxLemmaSize]; + int splid_len = 0; + int hz16_len = 0; + char16 *fr16 = NULL; + int fr16_len = 0; + + while (p - ptr < len) { + // Pinyin + py16 = p; + splid_len = 0; + while (*p != 0x2c && (p - ptr) < len) { + if (*p == 0x20) splid_len++; + p++; + } + splid_len++; + if (p - ptr == len) break; + py16_len = p - py16; + if (kMaxLemmaSize < splid_len) { + break; + } + bool is_pre; + int splidl = spl_parser->splstr16_to_idxs_f(py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre); + if (splidl != splid_len) break; + // Phrase + hz16 = ++p; + while (*p != 0x2c && (p - ptr) < len) { + p++; + } + hz16_len = p - hz16; + if (hz16_len != splid_len) break; + // Frequency + fr16 = ++p; + fr16_len = 0; + while (*p != 0x2c && (p - ptr) < len) { + p++; + } + fr16_len = p - fr16; + uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len); + // Last modified time + fr16 = ++p; + fr16_len = 0; + while (*p != 0x3b && (p - ptr) < len) { + p++; + } + fr16_len = p - fr16; + uint64 last_mod = utf16le_atoll(fr16, fr16_len); + + put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod); + newly_added++; + + p++; + } + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("put_lemmas_no_sync_from_utf16le_string"); +#endif + return newly_added; +} + +int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(char16 *str, int size, int *count) { + int len = 0; + *count = 0; + + int left_len = size; + + if (is_valid_state() == false) return len; + + SpellingTrie *spl_trie = &SpellingTrie::get_instance(); + if (!spl_trie) { + return 0; + } + + uint32 i; + for (i = 0; i < dict_info_.sync_count; i++) { + int offset = syncs_[i]; + uint32 nchar = get_lemma_nchar(offset); + uint16 *spl = get_lemma_spell_ids(offset); + uint16 *wrd = get_lemma_word(offset); + int score = _get_lemma_score(wrd, spl, nchar); + + static char score_temp[32], *pscore_temp = score_temp; + static char16 temp[256], *ptemp = temp; + + pscore_temp = score_temp; + ptemp = temp; + + uint32 j; + // Add pinyin + for (j = 0; j < nchar; j++) { + int ret_len = spl_trie->get_spelling_str16(spl[j], ptemp, temp + sizeof(temp) - ptemp); + if (ret_len <= 0) break; + ptemp += ret_len; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ' '; + } else { + j = 0; + break; + } + } + if (j < nchar) { + continue; + } + ptemp--; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ','; + } else { + continue; + } + // Add phrase + for (j = 0; j < nchar; j++) { + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = wrd[j]; + } else { + break; + } + } + if (j < nchar) { + continue; + } + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ','; + } else { + continue; + } + // Add frequency + uint32 intf = extract_score_freq(score); + int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp); + if (ret_len <= 0) continue; + ptemp += ret_len; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ','; + } else { + continue; + } + // Add last modified time + uint64 last_mod = extract_score_lmt(score); + ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp); + if (ret_len <= 0) continue; + ptemp += ret_len; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ';'; + } else { + continue; + } + + // Write to string + int need_len = ptemp - temp; + if (need_len > left_len) break; + memcpy(str + len, temp, need_len * 2); + left_len -= need_len; + + len += need_len; + (*count)++; + } + + if (len > 0) { + if (state_ < USER_DICT_SYNC_DIRTY) state_ = USER_DICT_SYNC_DIRTY; + } + return len; +} + +#endif + +bool UserDict::state(UserDictStat *stat) { + if (is_valid_state() == false) return false; + if (!stat) return false; + stat->version = version_; + stat->file_name = dict_file_; + stat->load_time.tv_sec = load_time_.tv_sec; + stat->load_time.tv_usec = load_time_.tv_usec; + pthread_mutex_lock(&g_mutex_); + stat->last_update.tv_sec = g_last_update_.tv_sec; + stat->last_update.tv_usec = g_last_update_.tv_usec; + pthread_mutex_unlock(&g_mutex_); + stat->disk_size = get_dict_file_size(&dict_info_); + stat->lemma_count = dict_info_.lemma_count; + stat->lemma_size = dict_info_.lemma_size; + stat->delete_count = dict_info_.free_count; + stat->delete_size = dict_info_.free_size; +#ifdef ___SYNC_ENABLED___ + stat->sync_count = dict_info_.sync_count; +#endif + stat->limit_lemma_count = dict_info_.limit_lemma_count; + stat->limit_lemma_size = dict_info_.limit_lemma_size; + stat->reclaim_ratio = dict_info_.reclaim_ratio; + return true; +} + +void UserDict::set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio) { + dict_info_.limit_lemma_count = max_lemma_count; + dict_info_.limit_lemma_size = max_lemma_size; + if (reclaim_ratio > 100) reclaim_ratio = 100; + dict_info_.reclaim_ratio = reclaim_ratio; +} + +void UserDict::reclaim() { + if (is_valid_state() == false) return; + + switch (dict_info_.reclaim_ratio) { + case 0: + return; + case 100: + // TODO: CLEAR to be implemented + assert(false); + return; + default: + break; + } + + // XXX Reclaim is only based on count, not size + uint32 count = dict_info_.lemma_count; + int rc = count * dict_info_.reclaim_ratio / 100; + + UserDictScoreOffsetPair *score_offset_pairs = NULL; + score_offset_pairs = (UserDictScoreOffsetPair *)malloc(sizeof(UserDictScoreOffsetPair) * rc); + if (score_offset_pairs == NULL) { + return; + } + + for (int i = 0; i < rc; i++) { + int s = scores_[i]; + score_offset_pairs[i].score = s; + score_offset_pairs[i].offset_index = i; + } + + for (int i = (rc + 1) / 2; i >= 0; i--) shift_down(score_offset_pairs, i, rc); + + for (uint32 i = rc; i < dict_info_.lemma_count; i++) { + int s = scores_[i]; + if (s < score_offset_pairs[0].score) { + score_offset_pairs[0].score = s; + score_offset_pairs[0].offset_index = i; + shift_down(score_offset_pairs, 0, rc); + } + } + + for (int i = 0; i < rc; i++) { + int off = score_offset_pairs[i].offset_index; + remove_lemma_by_offset_index(off); + } + if (rc > 0) { + if (state_ < USER_DICT_OFFSET_DIRTY) state_ = USER_DICT_OFFSET_DIRTY; + } + + free(score_offset_pairs); +} + +inline void UserDict::swap(UserDictScoreOffsetPair *sop, int i, int j) { + int s = sop[i].score; + int p = sop[i].offset_index; + sop[i].score = sop[j].score; + sop[i].offset_index = sop[j].offset_index; + sop[j].score = s; + sop[j].offset_index = p; +} + +void UserDict::shift_down(UserDictScoreOffsetPair *sop, int i, int n) { + int par = i; + while (par < n) { + int left = par * 2 + 1; + int right = left + 1; + if (left >= n && right >= n) break; + if (right >= n) { + if (sop[left].score > sop[par].score) { + swap(sop, left, par); + par = left; + continue; + } + } else if (sop[left].score > sop[right].score && sop[left].score > sop[par].score) { + swap(sop, left, par); + par = left; + continue; + } else if (sop[right].score > sop[left].score && sop[right].score > sop[par].score) { + swap(sop, right, par); + par = right; + continue; + } + break; + } +} + +LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count) { return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL)); } + +LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + if (is_valid_state() == false) return 0; + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off != -1) { + int delta_score = count - scores_[off]; + dict_info_.total_nfreq += delta_score; + scores_[off] = build_score(lmt, count); + if (state_ < USER_DICT_SCORE_DIRTY) state_ = USER_DICT_SCORE_DIRTY; +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("_put_lemma(update)"); +#endif + return ids_[off]; + } else { + if ((dict_info_.limit_lemma_count > 0 && dict_info_.lemma_count >= dict_info_.limit_lemma_count) || (dict_info_.limit_lemma_size > 0 && dict_info_.lemma_size + (2 + (lemma_len << 2)) > dict_info_.limit_lemma_size)) { + // XXX Don't defragment here, it's too time-consuming. + return 0; + } + int flushed = 0; + if (lemma_count_left_ == 0 || lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) { + // XXX When there is no space for new lemma, we flush to disk + // flush_cache() may be called by upper user + // and better place shoule be found instead of here + flush_cache(); + flushed = 1; + // Or simply return and do nothing + // return 0; + } +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)"); +#endif + LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt); +#ifdef ___SYNC_ENABLED___ + if (syncs_ && id != 0) { + queue_lemma_for_sync(id); + } +#endif + return id; + } + return 0; +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::queue_lemma_for_sync(LemmaIdType id) { + if (dict_info_.sync_count < sync_count_size_) { + syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; + } else { + uint32 *syncs = (uint32 *)realloc(syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2); + if (syncs) { + sync_count_size_ += kUserDictPreAlloc; + syncs_ = syncs; + syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; + } + } +} +#endif + +LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + if (is_valid_state() == false) return 0; + if (is_valid_lemma_id(lemma_id) == false) return 0; + uint32 offset = offsets_by_id_[lemma_id - start_id_]; + uint8 lemma_len = get_lemma_nchar(offset); + char16 *lemma_str = get_lemma_word(offset); + uint16 *splids = get_lemma_spell_ids(offset); + + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off != -1) { + int score = scores_[off]; + int count = extract_score_freq(score); + uint64 lmt = extract_score_lmt(score); + if (count + delta_count > kUserDictMaxFrequency || count + delta_count < count) { + delta_count = kUserDictMaxFrequency - count; + } + count += delta_count; + dict_info_.total_nfreq += delta_count; + if (selected) { + lmt = time(NULL); + } + scores_[off] = build_score(lmt, count); + if (state_ < USER_DICT_SCORE_DIRTY) state_ = USER_DICT_SCORE_DIRTY; +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("update_lemma"); +#endif +#ifdef ___SYNC_ENABLED___ + queue_lemma_for_sync(ids_[off]); +#endif + return ids_[off]; + } + return 0; +} + +size_t UserDict::get_total_lemma_count() { return dict_info_.total_nfreq; } + +void UserDict::set_total_lemma_count_of_others(size_t count) { total_other_nfreq_ = count; } + +LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { + LemmaIdType id = get_max_lemma_id() + 1; + size_t offset = dict_info_.lemma_size; + if (offset > kUserDictOffsetMask) return 0; + + lemmas_[offset] = 0; + lemmas_[offset + 1] = (uint8)lemma_len; + for (size_t i = 0; i < lemma_len; i++) { + *((uint16 *)&lemmas_[offset + 2 + (i << 1)]) = splids[i]; + *((char16 *)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)]) = lemma_str[i]; + } + uint32 off = dict_info_.lemma_count; + offsets_[off] = offset; + scores_[off] = build_score(lmt, count); + ids_[off] = id; +#ifdef ___PREDICT_ENABLED___ + predicts_[off] = offset; +#endif + + offsets_by_id_[id - start_id_] = offset; + + dict_info_.lemma_count++; + dict_info_.lemma_size += (2 + (lemma_len << 2)); + lemma_count_left_--; + lemma_size_left_ -= (2 + (lemma_len << 2)); + + // Sort + + UserDictSearchable searchable; + prepare_locate(&searchable, splids, lemma_len); + + size_t i = 0; + while (i < off) { + offset = offsets_[i]; + uint32 nchar = get_lemma_nchar(offset); + uint16 *spl = get_lemma_spell_ids(offset); + + if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable)) break; + i++; + } + if (i != off) { + uint32 temp = offsets_[off]; + memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2); + offsets_[i] = temp; + + temp = scores_[off]; + memmove(scores_ + i + 1, scores_ + i, (off - i) << 2); + scores_[i] = temp; + + temp = ids_[off]; + memmove(ids_ + i + 1, ids_ + i, (off - i) << 2); + ids_[i] = temp; + } + +#ifdef ___PREDICT_ENABLED___ + uint32 j = 0; + uint16 *words_new = get_lemma_word(predicts_[off]); + j = locate_where_to_insert_in_predicts(words_new, lemma_len); + if (j != off) { + uint32 temp = predicts_[off]; + memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2); + predicts_[j] = temp; + } +#endif + + if (state_ < USER_DICT_LEMMA_DIRTY) state_ = USER_DICT_LEMMA_DIRTY; + +#ifdef ___CACHE_ENABLED___ + cache_init(); +#endif + + dict_info_.total_nfreq += count; + return id; +} +} // namespace ime_pinyin diff --git a/tests/main.cpp b/tests/main.cpp index 5284f0f..d5437bd 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -1,35 +1,35 @@ -#include "../src/include/pinyinime.h" -#include -#include -#include -#include - -std::string fromUtf16(const ime_pinyin::char16 *buf, size_t len) { - // 转换为标准 char16_t - std::u16string utf16Str(reinterpret_cast(buf), len); - std::wstring_convert, char16_t> convert; - return convert.to_bytes(utf16Str); -} - -int main() { - if (!ime_pinyin::im_open_decoder("./data/dict_pinyin.dat", "./data/user_dict.dat")) { - std::cout << "fany bug.\n"; - return 0; - } - - std::string pinyin = "ni'ma'si'le"; - pinyin = "ni'ma'mei'si"; - pinyin = "ni'shuo'ni'ma'ne"; - size_t cand_cnt = ime_pinyin::im_search(pinyin.c_str(), pinyin.size()); - ime_pinyin::char16 buf[256] = {0}; - std::string msg; - for (size_t i = 0; i < 100; ++i) { - ime_pinyin::im_get_candidate(i, buf, 255); - size_t len = 0; - while (buf[len] != 0 && len < 255) ++len; - msg.append(fromUtf16(buf, len) + " "); - } - std::cout << "候选项数量: " << cand_cnt << std::endl; - std::cout << "候选项本体: " << msg << std::endl; - return 0; -} +#include "../src/include/pinyinime.h" +#include +#include +#include +#include + +std::string fromUtf16(const ime_pinyin::char16 *buf, size_t len) { + // 转换为标准 char16_t + std::u16string utf16Str(reinterpret_cast(buf), len); + std::wstring_convert, char16_t> convert; + return convert.to_bytes(utf16Str); +} + +int main() { + if (!ime_pinyin::im_open_decoder("./data/dict_pinyin.dat", "./data/user_dict.dat")) { + std::cout << "fany bug.\n"; + return 0; + } + + std::string pinyin = "ni'ma'si'le"; + pinyin = "ni'ma'mei'si"; + pinyin = "ni'shuo'ni'ma'ne"; + size_t cand_cnt = ime_pinyin::im_search(pinyin.c_str(), pinyin.size()); + ime_pinyin::char16 buf[256] = {0}; + std::string msg; + for (size_t i = 0; i < 100; ++i) { + ime_pinyin::im_get_candidate(i, buf, 255); + size_t len = 0; + while (buf[len] != 0 && len < 255) ++len; + msg.append(fromUtf16(buf, len) + " "); + } + std::cout << "候选项数量: " << cand_cnt << std::endl; + std::cout << "候选项本体: " << msg << std::endl; + return 0; +}