mirror of
https://github.com/fanlumaster/googlepinyinime-rev.git
synced 2025-07-18 08:57:54 +08:00
adapt to win32
This commit is contained in:
@ -1,55 +1,59 @@
|
||||
/*
|
||||
* Copyright (C) 2009 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include "../src/include/dicttrie.h"
|
||||
|
||||
using namespace ime_pinyin;
|
||||
|
||||
/**
|
||||
* Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined
|
||||
* in dictdef.h.
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
DictTrie* dict_trie = new DictTrie();
|
||||
bool success;
|
||||
if (argc >= 3)
|
||||
success = dict_trie->build_dict(argv[1], argv[2]);
|
||||
else
|
||||
success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt", "../data/valid_utf16.txt");
|
||||
|
||||
if (success) {
|
||||
printf("Build dictionary successfully.\n");
|
||||
} else {
|
||||
printf("Build dictionary unsuccessfully.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
success = dict_trie->save_dict("./dict/dict_pinyin.dat");
|
||||
|
||||
if (success) {
|
||||
printf("Save dictionary successfully.\n");
|
||||
} else {
|
||||
printf("Save dictionary unsuccessfully.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
/*
|
||||
* Copyright (C) 2009 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#ifdef _WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "../src/include/dicttrie.h"
|
||||
|
||||
using namespace ime_pinyin;
|
||||
|
||||
/**
|
||||
* Build binary dictionary model. Make sure that ___BUILD_MODEL___ is defined
|
||||
* in dictdef.h.
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
DictTrie* dict_trie = new DictTrie();
|
||||
bool success;
|
||||
if (argc >= 3)
|
||||
success = dict_trie->build_dict(argv[1], argv[2]);
|
||||
else
|
||||
success = dict_trie->build_dict("../data/rawdict_utf16_65105_freq.txt", "../data/valid_utf16.txt");
|
||||
|
||||
if (success) {
|
||||
printf("Build dictionary successfully.\n");
|
||||
} else {
|
||||
printf("Build dictionary unsuccessfully.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
success = dict_trie->save_dict("./dict/dict_pinyin.dat");
|
||||
|
||||
if (success) {
|
||||
printf("Save dictionary successfully.\n");
|
||||
} else {
|
||||
printf("Save dictionary unsuccessfully.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
46
command/lcompile.sh → command/scripts/lcompile.sh
Executable file → Normal file
46
command/lcompile.sh → command/scripts/lcompile.sh
Executable file → Normal file
@ -1,23 +1,23 @@
|
||||
#!/bin/bash
# Configure and build the CMake project found in the current directory.
# Exits non-zero when CMakeLists.txt is missing or when a cmake step fails.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Propagate cmake's exit status; previously a failed configure or build still
# left the script exiting with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?
|
||||
#!/bin/bash
# Configure and build the CMake project found in the current directory.
# Exits non-zero when CMakeLists.txt is missing or when a cmake step fails.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Propagate cmake's exit status; previously a failed configure or build still
# left the script exiting with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?
|
90
llaunch.sh → command/scripts/llaunch.sh
Executable file → Normal file
90
llaunch.sh → command/scripts/llaunch.sh
Executable file → Normal file
@ -1,45 +1,45 @@
|
||||
#!/bin/bash
# Configure, build and then run the project executable.
# The executable name is read from the quoted value on the
# `set(MY_EXECUTABLE_NAME "...")` line of ./CMakeLists.txt and is run from
# ./build/bin/. Exits non-zero when CMakeLists.txt is missing, a cmake step
# fails, or no executable name can be found.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Stop with cmake's own status instead of silently finishing with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
        echo "start running as follows..."
        echo "=================================================="
    fi
done <<<"$content"

# execute the binary file
if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
||||
#!/bin/bash
# Configure, build and then run the project executable.
# The executable name is read from the quoted value on the
# `set(MY_EXECUTABLE_NAME "...")` line of ./CMakeLists.txt and is run from
# ./build/bin/. Exits non-zero when CMakeLists.txt is missing, a cmake step
# fails, or no executable name can be found.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Stop with cmake's own status instead of silently finishing with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
        echo "start running as follows..."
        echo "=================================================="
    fi
done <<<"$content"

# execute the binary file
if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
36
lrun.sh → command/scripts/lrun.sh
Executable file → Normal file
36
lrun.sh → command/scripts/lrun.sh
Executable file → Normal file
@ -1,18 +1,18 @@
|
||||
#!/bin/bash
# Run the executable named by the `set(MY_EXECUTABLE_NAME "...")` line of
# ./CMakeLists.txt from ./build/bin/, without rebuilding anything.
# The shebang is required: the script relies on bash-only syntax
# ([[ ]], =~, here-strings, $(<file)).
if [ ! -f "./CMakeLists.txt" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
    fi
done <<<"$content"

if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
||||
#!/bin/bash
# Run the executable named by the `set(MY_EXECUTABLE_NAME "...")` line of
# ./CMakeLists.txt from ./build/bin/, without rebuilding anything.
# The shebang is required: the script relies on bash-only syntax
# ([[ ]], =~, here-strings, $(<file)).
if [ ! -f "./CMakeLists.txt" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
    fi
done <<<"$content"

if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
26
scripts/lcompile.ps1
Normal file
26
scripts/lcompile.ps1
Normal file
@ -0,0 +1,26 @@
|
||||
# Configure the CMake project in the current directory with the
# Visual Studio 2022 x64 generator, then build it with the DEBUG configuration.
$currentDirectory = Get-Location
$cmakeListsPath = Join-Path -Path $currentDirectory -ChildPath "CMakeLists.txt"

# Nothing to do without a CMakeLists.txt next to the caller.
if (-not (Test-Path $cmakeListsPath)) {
    Write-Host("No CMakeLists.txt in current directory, please check.")
    return
}

Write-Host "Start generating and compiling..."

$buildFolderPath = ".\build"
$buildFolderExists = Test-Path $buildFolderPath

if (-not $buildFolderExists) {
    New-Item -ItemType Directory -Path $buildFolderPath | Out-Null
    Write-Host "build folder created."
}

cmake -G "Visual Studio 17 2022" -A x64 -S . -B ./build/

# Skip the build step when the configure step reported a failure.
if ($LASTEXITCODE -ne 0) {
    return
}

cmake --build ./build/ --config DEBUG
|
46
lcompile.sh → scripts/lcompile.sh
Executable file → Normal file
46
lcompile.sh → scripts/lcompile.sh
Executable file → Normal file
@ -1,23 +1,23 @@
|
||||
#!/bin/bash
# Configure and build the CMake project found in the current directory.
# Exits non-zero when CMakeLists.txt is missing or when a cmake step fails.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Propagate cmake's exit status; previously a failed configure or build still
# left the script exiting with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?
|
||||
#!/bin/bash
# Configure and build the CMake project found in the current directory.
# Exits non-zero when CMakeLists.txt is missing or when a cmake step fails.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Propagate cmake's exit status; previously a failed configure or build still
# left the script exiting with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?
|
45
scripts/llaunch.ps1
Normal file
45
scripts/llaunch.ps1
Normal file
@ -0,0 +1,45 @@
|
||||
#
|
||||
# generate, compile and run exe files
|
||||
#
|
||||
function getExePathFromCMakeLists() {
    # Scans ./CMakeLists.txt line by line for the first
    # `set(MY_EXECUTABLE_NAME ... "<name>"` entry and returns
    # "./build/bin/Debug/<name>.exe". Returns an empty string when no line matches.
    $namePattern = 'set\(MY_EXECUTABLE_NAME[^\"]*\"([^\"]+)\"'
    $cmakeText = Get-Content -Raw -Path "./CMakeLists.txt"

    foreach ($cmakeLine in ($cmakeText -split "`n")) {
        if ($cmakeLine -match $namePattern) {
            # The first match wins; later definitions are ignored.
            return "./build/bin/Debug/$($matches[1]).exe"
        }
    }

    return ""
}
|
||||
|
||||
# Configure, build (DEBUG) and run the project executable.
$currentDirectory = Get-Location
$cmakeListsPath = Join-Path -Path $currentDirectory -ChildPath "CMakeLists.txt"

if (-not (Test-Path $cmakeListsPath)) {
    Write-Host("No CMakeLists.txt in current directory, please check.")
    return
}

Write-Host "Start generating and compiling..."

$buildFolderPath = ".\build"

if (-not (Test-Path $buildFolderPath)) {
    New-Item -ItemType Directory -Path $buildFolderPath | Out-Null
    Write-Host "build folder created."
}

cmake -G "Visual Studio 17 2022" -A x64 -S . -B ./build/

if ($LASTEXITCODE -eq 0) {
    cmake --build ./build/ --config DEBUG
    if ($LASTEXITCODE -eq 0) {
        $exePath = getExePathFromCMakeLists
        if ([string]::IsNullOrEmpty($exePath)) {
            # Invoke-Expression used to throw on an empty path; report it instead.
            Write-Host "cannot find executable file path"
        }
        else {
            Write-Host "start running as follows..."
            Write-Host "=================================================="
            # The call operator runs the path as a command without evaluating
            # the text read from CMakeLists.txt as PowerShell code.
            & $exePath
        }
    }
}
|
||||
|
90
command/llaunch.sh → scripts/llaunch.sh
Executable file → Normal file
90
command/llaunch.sh → scripts/llaunch.sh
Executable file → Normal file
@ -1,45 +1,45 @@
|
||||
#!/bin/bash
# Configure, build and then run the project executable.
# The executable name is read from the quoted value on the
# `set(MY_EXECUTABLE_NAME "...")` line of ./CMakeLists.txt and is run from
# ./build/bin/. Exits non-zero when CMakeLists.txt is missing, a cmake step
# fails, or no executable name can be found.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Stop with cmake's own status instead of silently finishing with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
        echo "start running as follows..."
        echo "=================================================="
    fi
done <<<"$content"

# execute the binary file
if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
||||
#!/bin/bash
# Configure, build and then run the project executable.
# The executable name is read from the quoted value on the
# `set(MY_EXECUTABLE_NAME "...")` line of ./CMakeLists.txt and is run from
# ./build/bin/. Exits non-zero when CMakeLists.txt is missing, a cmake step
# fails, or no executable name can be found.
currentDirectory=$(pwd)
cmakeListsPath="${currentDirectory}/CMakeLists.txt"

if [ ! -f "$cmakeListsPath" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

echo "Start generating and compiling..."

buildFolderPath="./build"

if [ ! -d "$buildFolderPath" ]; then
    mkdir -p "$buildFolderPath"
    echo "build folder created."
fi

# Stop with cmake's own status instead of silently finishing with status 0.
cmake -G "Unix Makefiles" -D CMAKE_CXX_COMPILER=/usr/bin/g++ -S . -B "$buildFolderPath" || exit $?

cmake --build "$buildFolderPath" --config DEBUG || exit $?

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
        echo "start running as follows..."
        echo "=================================================="
    fi
done <<<"$content"

# execute the binary file
if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
20
scripts/lrun.ps1
Normal file
20
scripts/lrun.ps1
Normal file
@ -0,0 +1,20 @@
|
||||
#
|
||||
# run exe file that has already been compiled before
|
||||
#
|
||||
function getExePathFromCMakeLists() {
    # Scans ./CMakeLists.txt line by line for the first
    # `set(MY_EXECUTABLE_NAME ... "<name>"` entry and returns
    # "./build/bin/Debug/<name>.exe". Returns an empty string when no line matches.
    $namePattern = 'set\(MY_EXECUTABLE_NAME[^\"]*\"([^\"]+)\"'
    $cmakeText = Get-Content -Raw -Path "./CMakeLists.txt"

    foreach ($cmakeLine in ($cmakeText -split "`n")) {
        if ($cmakeLine -match $namePattern) {
            # The first match wins; later definitions are ignored.
            return "./build/bin/Debug/$($matches[1]).exe"
        }
    }

    return ""
}
|
||||
|
||||
# Resolve the executable path from CMakeLists.txt and run it.
$exePath = getExePathFromCMakeLists
#Write-Host "start running as follows..."
#Write-Host "=================================================="
if ([string]::IsNullOrEmpty($exePath)) {
    # Invoke-Expression used to throw on an empty path; report it instead.
    Write-Host "cannot find executable file path"
}
else {
    # The call operator runs the path as a command without evaluating the text
    # read from CMakeLists.txt as PowerShell code.
    & $exePath
}
|
36
command/lrun.sh → scripts/lrun.sh
Executable file → Normal file
36
command/lrun.sh → scripts/lrun.sh
Executable file → Normal file
@ -1,18 +1,18 @@
|
||||
#!/bin/bash
# Run the executable named by the `set(MY_EXECUTABLE_NAME "...")` line of
# ./CMakeLists.txt from ./build/bin/, without rebuilding anything.
# The shebang is required: the script relies on bash-only syntax
# ([[ ]], =~, here-strings, $(<file)).
if [ ! -f "./CMakeLists.txt" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
    fi
done <<<"$content"

if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
||||
#!/bin/bash
# Run the executable named by the `set(MY_EXECUTABLE_NAME "...")` line of
# ./CMakeLists.txt from ./build/bin/, without rebuilding anything.
# The shebang is required: the script relies on bash-only syntax
# ([[ ]], =~, here-strings, $(<file)).
if [ ! -f "./CMakeLists.txt" ]; then
    echo "No CMakeLists.txt in current directory, please check."
    exit 1
fi

content=$(<"./CMakeLists.txt")
exePath=""
pattern="\"([^\"]+)\""

while IFS= read -r line; do
    if [[ $line == "set(MY_EXECUTABLE_NAME"* && $line =~ $pattern ]]; then
        # Keep only the first whitespace-separated word of the quoted value.
        # `read` splits without pathname expansion, unlike an unquoted array.
        read -r exeName _ <<<"${BASH_REMATCH[1]}"
        exePath="./build/bin/${exeName}"
    fi
done <<<"$content"

if [ -n "$exePath" ]; then
    "$exePath"
else
    echo "cannot find executable file path"
    exit 1
fi
|
@ -1,390 +1,396 @@
|
||||
/*
|
||||
* Copyright (C) 2009 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PINYINIME_INCLUDE_USERDICT_H__
|
||||
#define PINYINIME_INCLUDE_USERDICT_H__
|
||||
|
||||
#define ___CACHE_ENABLED___
|
||||
#define ___SYNC_ENABLED___
|
||||
#define ___PREDICT_ENABLED___
|
||||
|
||||
// Debug performance for operations
|
||||
// #define ___DEBUG_PERF___
|
||||
|
||||
#include <pthread.h>
|
||||
#include "atomdictbase.h"
|
||||
|
||||
namespace ime_pinyin {
|
||||
|
||||
class UserDict : public AtomDictBase {
|
||||
public:
|
||||
UserDict();
|
||||
~UserDict();
|
||||
|
||||
bool load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id);
|
||||
|
||||
bool close_dict();
|
||||
|
||||
size_t number_of_lemmas();
|
||||
|
||||
void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
|
||||
|
||||
MileStoneHandle extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num);
|
||||
|
||||
size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max);
|
||||
|
||||
uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
|
||||
|
||||
uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid);
|
||||
|
||||
size_t predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used);
|
||||
|
||||
// Full spelling ids are required
|
||||
LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count);
|
||||
|
||||
LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected);
|
||||
|
||||
LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
|
||||
|
||||
LmaScoreType get_lemma_score(LemmaIdType lemma_id);
|
||||
|
||||
LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
|
||||
|
||||
bool remove_lemma(LemmaIdType lemma_id);
|
||||
|
||||
size_t get_total_lemma_count();
|
||||
void set_total_lemma_count_of_others(size_t count);
|
||||
|
||||
void flush_cache();
|
||||
|
||||
void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio);
|
||||
|
||||
void reclaim();
|
||||
|
||||
void defragment();
|
||||
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
void clear_sync_lemmas(unsigned int start, unsigned int end);
|
||||
|
||||
int get_sync_count();
|
||||
|
||||
LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt);
|
||||
/**
|
||||
* Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
|
||||
*
|
||||
* @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
|
||||
* @param len length of lemmas string in UTF-16LE
|
||||
* @return newly added lemma count
|
||||
*/
|
||||
int put_lemmas_no_sync_from_utf16le_string(char16 *lemmas, int len);
|
||||
|
||||
/**
|
||||
* Get lemmas need sync to a UTF-16LE string of above format.
|
||||
* Note: input buffer (str) must not be too small. If str is too small to
|
||||
* contain single one lemma, there might be a dead loop.
|
||||
*
|
||||
* @param str buffer to write lemmas
|
||||
* @param size buffer size in UTF-16LE
|
||||
* @param count output value of lemma returned
|
||||
* @return UTF-16LE string length
|
||||
*/
|
||||
int get_sync_lemmas_in_utf16le_string_from_beginning(char16 *str, int size, int *count);
|
||||
|
||||
#endif
|
||||
|
||||
struct UserDictStat {
|
||||
uint32 version;
|
||||
const char *file_name;
|
||||
struct timeval load_time;
|
||||
struct timeval last_update;
|
||||
uint32 disk_size;
|
||||
uint32 lemma_count;
|
||||
uint32 lemma_size;
|
||||
uint32 delete_count;
|
||||
uint32 delete_size;
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
uint32 sync_count;
|
||||
#endif
|
||||
uint32 reclaim_ratio;
|
||||
uint32 limit_lemma_count;
|
||||
uint32 limit_lemma_size;
|
||||
};
|
||||
|
||||
bool state(UserDictStat *stat);
|
||||
|
||||
private:
|
||||
uint32 total_other_nfreq_;
|
||||
struct timeval load_time_;
|
||||
LemmaIdType start_id_;
|
||||
uint32 version_;
|
||||
uint8 *lemmas_;
|
||||
|
||||
// In-Memory-Only flag for each lemma
|
||||
static const uint8 kUserDictLemmaFlagRemove = 1;
|
||||
// Inuse lemmas' offset
|
||||
uint32 *offsets_;
|
||||
// Highest bit in offset tells whether corresponding lemma is removed
|
||||
static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
|
||||
// Maximum possible for the offset
|
||||
static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
|
||||
// Bit width for last modified time, from 1 to 16
|
||||
static const uint32 kUserDictLMTBitWidth = 16;
|
||||
// Granularity for last modified time in second
|
||||
static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
|
||||
// Maximum frequency count
|
||||
static const uint16 kUserDictMaxFrequency = 0xFFFF;
|
||||
|
||||
#define COARSE_UTC(year, month, day, hour, minute, second) ((year - 1970) * 365 * 24 * 60 * 60 + (month - 1) * 30 * 24 * 60 * 60 + (day - 1) * 24 * 60 * 60 + (hour - 0) * 60 * 60 + (minute - 0) * 60 + (second - 0))
|
||||
static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
|
||||
|
||||
// Correspond to offsets_
|
||||
uint32 *scores_;
|
||||
// Following two fields are only valid in memory
|
||||
uint32 *ids_;
|
||||
#ifdef ___PREDICT_ENABLED___
|
||||
uint32 *predicts_;
|
||||
#endif
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
uint32 *syncs_;
|
||||
size_t sync_count_size_;
|
||||
#endif
|
||||
uint32 *offsets_by_id_;
|
||||
|
||||
size_t lemma_count_left_;
|
||||
size_t lemma_size_left_;
|
||||
|
||||
const char *dict_file_;
|
||||
|
||||
// Be sure size is 4xN
|
||||
struct UserDictInfo {
|
||||
// When limitation reached, how much percentage will be reclaimed (1 ~ 100)
|
||||
uint32 reclaim_ratio;
|
||||
// maximum lemma count, 0 means no limitation
|
||||
uint32 limit_lemma_count;
|
||||
// Maximum lemma size, it's different from
|
||||
// whole disk file size or in-mem dict size
|
||||
// 0 means no limitation
|
||||
uint32 limit_lemma_size;
|
||||
// Total lemma count including deleted and inuse
|
||||
// Also indicate offsets_ size
|
||||
uint32 lemma_count;
|
||||
// Total size of lemmas including used and freed
|
||||
uint32 lemma_size;
|
||||
// Freed lemma count
|
||||
uint32 free_count;
|
||||
// Freed lemma size in byte
|
||||
uint32 free_size;
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
uint32 sync_count;
|
||||
#endif
|
||||
int32 total_nfreq;
|
||||
} dict_info_;
|
||||
|
||||
static const uint32 kUserDictVersion = 0x0ABCDEF0;
|
||||
|
||||
static const uint32 kUserDictPreAlloc = 32;
|
||||
static const uint32 kUserDictAverageNchar = 8;
|
||||
|
||||
enum UserDictState {
|
||||
// Keep in order
|
||||
USER_DICT_NONE = 0,
|
||||
USER_DICT_SYNC,
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
USER_DICT_SYNC_DIRTY,
|
||||
#endif
|
||||
USER_DICT_SCORE_DIRTY,
|
||||
USER_DICT_OFFSET_DIRTY,
|
||||
USER_DICT_LEMMA_DIRTY,
|
||||
|
||||
USER_DICT_DEFRAGMENTED,
|
||||
} state_;
|
||||
|
||||
struct UserDictSearchable {
|
||||
uint16 splids_len;
|
||||
uint16 splid_start[kMaxLemmaSize];
|
||||
uint16 splid_count[kMaxLemmaSize];
|
||||
// Compact inital letters for both FuzzyCompareSpellId and cache system
|
||||
uint32 signature[kMaxLemmaSize / 4];
|
||||
};
|
||||
|
||||
#ifdef ___CACHE_ENABLED___
|
||||
enum UserDictCacheType {
|
||||
USER_DICT_CACHE,
|
||||
USER_DICT_MISS_CACHE,
|
||||
};
|
||||
|
||||
static const int kUserDictCacheSize = 4;
|
||||
static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
|
||||
|
||||
struct UserDictMissCache {
|
||||
uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
|
||||
uint16 head, tail;
|
||||
} miss_caches_[kMaxLemmaSize];
|
||||
|
||||
struct UserDictCache {
|
||||
uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
|
||||
uint32 offsets[kUserDictCacheSize];
|
||||
uint32 lengths[kUserDictCacheSize];
|
||||
// Ring buffer
|
||||
uint16 head, tail;
|
||||
} caches_[kMaxLemmaSize];
|
||||
|
||||
void cache_init();
|
||||
|
||||
void cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length);
|
||||
|
||||
bool cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length);
|
||||
|
||||
bool load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length);
|
||||
|
||||
void save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length);
|
||||
|
||||
void reset_cache();
|
||||
|
||||
bool load_miss_cache(UserDictSearchable *searchable);
|
||||
|
||||
void save_miss_cache(UserDictSearchable *searchable);
|
||||
|
||||
void reset_miss_cache();
|
||||
#endif
|
||||
|
||||
LmaScoreType translate_score(int f);
|
||||
|
||||
int extract_score_freq(int raw_score);
|
||||
|
||||
uint64 extract_score_lmt(int raw_score);
|
||||
|
||||
inline int build_score(uint64 lmt, int freq);
|
||||
|
||||
inline int64 utf16le_atoll(uint16 *s, int len);
|
||||
|
||||
inline int utf16le_lltoa(int64 v, uint16 *s, int size);
|
||||
|
||||
LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt);
|
||||
|
||||
size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool *need_extend);
|
||||
|
||||
int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
|
||||
|
||||
int _get_lemma_score(LemmaIdType lemma_id);
|
||||
|
||||
int is_fuzzy_prefix_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable);
|
||||
|
||||
bool is_prefix_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable);
|
||||
|
||||
uint32 get_dict_file_size(UserDictInfo *info);
|
||||
|
||||
bool reset(const char *file);
|
||||
|
||||
bool validate(const char *file);
|
||||
|
||||
bool load(const char *file, LemmaIdType start_id);
|
||||
|
||||
bool is_valid_state();
|
||||
|
||||
bool is_valid_lemma_id(LemmaIdType id);
|
||||
|
||||
LemmaIdType get_max_lemma_id();
|
||||
|
||||
void set_lemma_flag(uint32 offset, uint8 flag);
|
||||
|
||||
char get_lemma_flag(uint32 offset);
|
||||
|
||||
char get_lemma_nchar(uint32 offset);
|
||||
|
||||
uint16 *get_lemma_spell_ids(uint32 offset);
|
||||
|
||||
uint16 *get_lemma_word(uint32 offset);
|
||||
|
||||
// Prepare searchable to fasten locate process
|
||||
void prepare_locate(UserDictSearchable *searchable, const uint16 *splids, uint16 len);
|
||||
|
||||
// Compare initial letters only
|
||||
int32 fuzzy_compare_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable);
|
||||
|
||||
// Compare exactly two spell ids
|
||||
// First argument must be a full id spell id
|
||||
bool equal_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable);
|
||||
|
||||
// Find first item by initial letters
|
||||
int32 locate_first_in_offsets(const UserDictSearchable *searchable);
|
||||
|
||||
LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt);
|
||||
|
||||
// Check if a lemma is in dictionary
|
||||
int32 locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len);
|
||||
|
||||
bool remove_lemma_by_offset_index(int offset_index);
|
||||
#ifdef ___PREDICT_ENABLED___
|
||||
uint32 locate_where_to_insert_in_predicts(const uint16 *words, int lemma_len);
|
||||
|
||||
int32 locate_first_in_predicts(const uint16 *words, int lemma_len);
|
||||
|
||||
void remove_lemma_from_predict_list(uint32 offset);
|
||||
#endif
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
void queue_lemma_for_sync(LemmaIdType id);
|
||||
|
||||
void remove_lemma_from_sync_list(uint32 offset);
|
||||
|
||||
void write_back_sync(int fd);
|
||||
#endif
|
||||
void write_back_score(int fd);
|
||||
void write_back_offset(int fd);
|
||||
void write_back_lemma(int fd);
|
||||
void write_back_all(int fd);
|
||||
void write_back();
|
||||
|
||||
struct UserDictScoreOffsetPair {
|
||||
int score;
|
||||
uint32 offset_index;
|
||||
};
|
||||
|
||||
inline void swap(UserDictScoreOffsetPair *sop, int i, int j);
|
||||
|
||||
void shift_down(UserDictScoreOffsetPair *sop, int i, int n);
|
||||
|
||||
// On-disk format for each lemma
|
||||
// +-------------+
|
||||
// | Version (4) |
|
||||
// +-------------+
|
||||
// +-----------+-----------+--------------------+-------------------+
|
||||
// | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
|
||||
// +-----------+-----------+--------------------+-------------------+
|
||||
// ...
|
||||
// +-----------------------+ +-------------+ <---Offset of offset
|
||||
// | Offset1 by_splids (4) | ... | OffsetN (4) |
|
||||
// +-----------------------+ +-------------+
|
||||
#ifdef ___PREDICT_ENABLED___
|
||||
// +----------------------+ +-------------+
|
||||
// | Offset1 by_lemma (4) | ... | OffsetN (4) |
|
||||
// +----------------------+ +-------------+
|
||||
#endif
|
||||
// +------------+ +------------+
|
||||
// | Score1 (4) | ... | ScoreN (4) |
|
||||
// +------------+ +------------+
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
// +-------------+ +-------------+
|
||||
// | NewAdd1 (4) | ... | NewAddN (4) |
|
||||
// +-------------+ +-------------+
|
||||
#endif
|
||||
// +----------------+
|
||||
// | Dict Info (4x) |
|
||||
// +----------------+
|
||||
};
|
||||
} // namespace ime_pinyin
|
||||
|
||||
#endif
|
||||
/*
|
||||
* Copyright (C) 2009 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PINYINIME_INCLUDE_USERDICT_H__
|
||||
#define PINYINIME_INCLUDE_USERDICT_H__
|
||||
|
||||
#define ___CACHE_ENABLED___
|
||||
#define ___SYNC_ENABLED___
|
||||
#define ___PREDICT_ENABLED___
|
||||
|
||||
// Debug performance for operations
|
||||
// #define ___DEBUG_PERF___
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <time.h>
|
||||
#include <winsock.h> // timeval
|
||||
#else
|
||||
#include <pthread.h>
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "atomdictbase.h"
|
||||
|
||||
namespace ime_pinyin {
|
||||
|
||||
class UserDict : public AtomDictBase {
|
||||
public:
|
||||
UserDict();
|
||||
~UserDict();
|
||||
|
||||
bool load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id);
|
||||
|
||||
bool close_dict();
|
||||
|
||||
size_t number_of_lemmas();
|
||||
|
||||
void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
|
||||
|
||||
MileStoneHandle extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num);
|
||||
|
||||
size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max);
|
||||
|
||||
uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
|
||||
|
||||
uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid);
|
||||
|
||||
size_t predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used);
|
||||
|
||||
// Full spelling ids are required
|
||||
LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count);
|
||||
|
||||
LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected);
|
||||
|
||||
LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
|
||||
|
||||
LmaScoreType get_lemma_score(LemmaIdType lemma_id);
|
||||
|
||||
LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
|
||||
|
||||
bool remove_lemma(LemmaIdType lemma_id);
|
||||
|
||||
size_t get_total_lemma_count();
|
||||
void set_total_lemma_count_of_others(size_t count);
|
||||
|
||||
void flush_cache();
|
||||
|
||||
void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio);
|
||||
|
||||
void reclaim();
|
||||
|
||||
void defragment();
|
||||
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
void clear_sync_lemmas(unsigned int start, unsigned int end);
|
||||
|
||||
int get_sync_count();
|
||||
|
||||
LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt);
|
||||
/**
|
||||
* Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
|
||||
*
|
||||
* @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
|
||||
* @param len length of lemmas string in UTF-16LE
|
||||
* @return newly added lemma count
|
||||
*/
|
||||
int put_lemmas_no_sync_from_utf16le_string(char16 *lemmas, int len);
|
||||
|
||||
/**
|
||||
* Get the lemmas that need syncing as a UTF-16LE string in the format above.
|
||||
* Note: input buffer (str) must not be too small. If str is too small to
|
||||
* contain even a single lemma, the call may loop forever.
|
||||
*
|
||||
* @param str buffer to write lemmas
|
||||
* @param size buffer size in UTF-16LE
|
||||
* @param count output: number of lemmas returned
|
||||
* @return UTF-16LE string length
|
||||
*/
|
||||
int get_sync_lemmas_in_utf16le_string_from_beginning(char16 *str, int size, int *count);
|
||||
|
||||
#endif
|
||||
|
||||
struct UserDictStat {
|
||||
uint32 version;
|
||||
const char *file_name;
|
||||
struct timeval load_time;
|
||||
struct timeval last_update;
|
||||
uint32 disk_size;
|
||||
uint32 lemma_count;
|
||||
uint32 lemma_size;
|
||||
uint32 delete_count;
|
||||
uint32 delete_size;
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
uint32 sync_count;
|
||||
#endif
|
||||
uint32 reclaim_ratio;
|
||||
uint32 limit_lemma_count;
|
||||
uint32 limit_lemma_size;
|
||||
};
|
||||
|
||||
bool state(UserDictStat *stat);
|
||||
|
||||
private:
|
||||
uint32 total_other_nfreq_;
|
||||
struct timeval load_time_;
|
||||
LemmaIdType start_id_;
|
||||
uint32 version_;
|
||||
uint8 *lemmas_;
|
||||
|
||||
// In-Memory-Only flag for each lemma
|
||||
static const uint8 kUserDictLemmaFlagRemove = 1;
|
||||
// Inuse lemmas' offset
|
||||
uint32 *offsets_;
|
||||
// Highest bit in offset tells whether corresponding lemma is removed
|
||||
static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
|
||||
// Maximum possible for the offset
|
||||
static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
|
||||
// Bit width for last modified time, from 1 to 16
|
||||
static const uint32 kUserDictLMTBitWidth = 16;
|
||||
// Granularity for last modified time in second
|
||||
static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
|
||||
// Maximum frequency count
|
||||
static const uint16 kUserDictMaxFrequency = 0xFFFF;
|
||||
|
||||
// Coarse "seconds since 1970-01-01" estimate that treats every year as 365
// days and every month as 30 days. Each argument is parenthesized so that
// arbitrary expressions can be passed, and every term is computed in unsigned
// 64-bit arithmetic so that years from 2039 onwards do not overflow a 32-bit
// int.
#define COARSE_UTC(year, month, day, hour, minute, second)   \
    (((year) - 1970) * (365ULL * 24 * 60 * 60) +             \
     ((month) - 1) * (30ULL * 24 * 60 * 60) +                \
     ((day) - 1) * (24ULL * 60 * 60) +                       \
     ((hour) - 0) * (60ULL * 60) +                           \
     ((minute) - 0) * 60ULL +                                \
     ((second) - 0) * 1ULL)
|
||||
static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
|
||||
|
||||
// Correspond to offsets_
|
||||
uint32 *scores_;
|
||||
// Following two fields are only valid in memory
|
||||
uint32 *ids_;
|
||||
#ifdef ___PREDICT_ENABLED___
|
||||
uint32 *predicts_;
|
||||
#endif
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
uint32 *syncs_;
|
||||
size_t sync_count_size_;
|
||||
#endif
|
||||
uint32 *offsets_by_id_;
|
||||
|
||||
size_t lemma_count_left_;
|
||||
size_t lemma_size_left_;
|
||||
|
||||
const char *dict_file_;
|
||||
|
||||
// Be sure size is 4xN
|
||||
struct UserDictInfo {
|
||||
// When limitation reached, how much percentage will be reclaimed (1 ~ 100)
|
||||
uint32 reclaim_ratio;
|
||||
// maximum lemma count, 0 means no limitation
|
||||
uint32 limit_lemma_count;
|
||||
// Maximum lemma size, it's different from
|
||||
// whole disk file size or in-mem dict size
|
||||
// 0 means no limitation
|
||||
uint32 limit_lemma_size;
|
||||
// Total lemma count including deleted and inuse
|
||||
// Also indicate offsets_ size
|
||||
uint32 lemma_count;
|
||||
// Total size of lemmas including used and freed
|
||||
uint32 lemma_size;
|
||||
// Freed lemma count
|
||||
uint32 free_count;
|
||||
// Freed lemma size in byte
|
||||
uint32 free_size;
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
uint32 sync_count;
|
||||
#endif
|
||||
int32 total_nfreq;
|
||||
} dict_info_;
|
||||
|
||||
static const uint32 kUserDictVersion = 0x0ABCDEF0;
|
||||
|
||||
static const uint32 kUserDictPreAlloc = 32;
|
||||
static const uint32 kUserDictAverageNchar = 8;
|
||||
|
||||
enum UserDictState {
|
||||
// Keep in order
|
||||
USER_DICT_NONE = 0,
|
||||
USER_DICT_SYNC,
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
USER_DICT_SYNC_DIRTY,
|
||||
#endif
|
||||
USER_DICT_SCORE_DIRTY,
|
||||
USER_DICT_OFFSET_DIRTY,
|
||||
USER_DICT_LEMMA_DIRTY,
|
||||
|
||||
USER_DICT_DEFRAGMENTED,
|
||||
} state_;
|
||||
|
||||
struct UserDictSearchable {
|
||||
uint16 splids_len;
|
||||
uint16 splid_start[kMaxLemmaSize];
|
||||
uint16 splid_count[kMaxLemmaSize];
|
||||
// Compact initial letters for both FuzzyCompareSpellId and cache system
|
||||
uint32 signature[kMaxLemmaSize / 4];
|
||||
};
|
||||
|
||||
#ifdef ___CACHE_ENABLED___
|
||||
enum UserDictCacheType {
|
||||
USER_DICT_CACHE,
|
||||
USER_DICT_MISS_CACHE,
|
||||
};
|
||||
|
||||
static const int kUserDictCacheSize = 4;
|
||||
static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
|
||||
|
||||
struct UserDictMissCache {
|
||||
uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
|
||||
uint16 head, tail;
|
||||
} miss_caches_[kMaxLemmaSize];
|
||||
|
||||
struct UserDictCache {
|
||||
uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
|
||||
uint32 offsets[kUserDictCacheSize];
|
||||
uint32 lengths[kUserDictCacheSize];
|
||||
// Ring buffer
|
||||
uint16 head, tail;
|
||||
} caches_[kMaxLemmaSize];
|
||||
|
||||
void cache_init();
|
||||
|
||||
void cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length);
|
||||
|
||||
bool cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length);
|
||||
|
||||
bool load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length);
|
||||
|
||||
void save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length);
|
||||
|
||||
void reset_cache();
|
||||
|
||||
bool load_miss_cache(UserDictSearchable *searchable);
|
||||
|
||||
void save_miss_cache(UserDictSearchable *searchable);
|
||||
|
||||
void reset_miss_cache();
|
||||
#endif
|
||||
|
||||
LmaScoreType translate_score(int f);
|
||||
|
||||
int extract_score_freq(int raw_score);
|
||||
|
||||
uint64 extract_score_lmt(int raw_score);
|
||||
|
||||
inline int build_score(uint64 lmt, int freq);
|
||||
|
||||
inline int64 utf16le_atoll(uint16 *s, int len);
|
||||
|
||||
inline int utf16le_lltoa(int64 v, uint16 *s, int size);
|
||||
|
||||
LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt);
|
||||
|
||||
size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool *need_extend);
|
||||
|
||||
int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
|
||||
|
||||
int _get_lemma_score(LemmaIdType lemma_id);
|
||||
|
||||
int is_fuzzy_prefix_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable);
|
||||
|
||||
bool is_prefix_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable);
|
||||
|
||||
uint32 get_dict_file_size(UserDictInfo *info);
|
||||
|
||||
bool reset(const char *file);
|
||||
|
||||
bool validate(const char *file);
|
||||
|
||||
bool load(const char *file, LemmaIdType start_id);
|
||||
|
||||
bool is_valid_state();
|
||||
|
||||
bool is_valid_lemma_id(LemmaIdType id);
|
||||
|
||||
LemmaIdType get_max_lemma_id();
|
||||
|
||||
void set_lemma_flag(uint32 offset, uint8 flag);
|
||||
|
||||
char get_lemma_flag(uint32 offset);
|
||||
|
||||
char get_lemma_nchar(uint32 offset);
|
||||
|
||||
uint16 *get_lemma_spell_ids(uint32 offset);
|
||||
|
||||
uint16 *get_lemma_word(uint32 offset);
|
||||
|
||||
// Prepare searchable to speed up the locate process
|
||||
void prepare_locate(UserDictSearchable *searchable, const uint16 *splids, uint16 len);
|
||||
|
||||
// Compare initial letters only
|
||||
int32 fuzzy_compare_spell_id(const uint16 *id1, uint16 len1, const UserDictSearchable *searchable);
|
||||
|
||||
// Compare exactly two spell ids
|
||||
// First argument must be a full id spell id
|
||||
bool equal_spell_id(const uint16 *fullids, uint16 fulllen, const UserDictSearchable *searchable);
|
||||
|
||||
// Find first item by initial letters
|
||||
int32 locate_first_in_offsets(const UserDictSearchable *searchable);
|
||||
|
||||
LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt);
|
||||
|
||||
// Check if a lemma is in dictionary
|
||||
int32 locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len);
|
||||
|
||||
bool remove_lemma_by_offset_index(int offset_index);
|
||||
#ifdef ___PREDICT_ENABLED___
|
||||
uint32 locate_where_to_insert_in_predicts(const uint16 *words, int lemma_len);
|
||||
|
||||
int32 locate_first_in_predicts(const uint16 *words, int lemma_len);
|
||||
|
||||
void remove_lemma_from_predict_list(uint32 offset);
|
||||
#endif
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
void queue_lemma_for_sync(LemmaIdType id);
|
||||
|
||||
void remove_lemma_from_sync_list(uint32 offset);
|
||||
|
||||
void write_back_sync(int fd);
|
||||
#endif
|
||||
void write_back_score(int fd);
|
||||
void write_back_offset(int fd);
|
||||
void write_back_lemma(int fd);
|
||||
void write_back_all(int fd);
|
||||
void write_back();
|
||||
|
||||
struct UserDictScoreOffsetPair {
|
||||
int score;
|
||||
uint32 offset_index;
|
||||
};
|
||||
|
||||
inline void swap(UserDictScoreOffsetPair *sop, int i, int j);
|
||||
|
||||
void shift_down(UserDictScoreOffsetPair *sop, int i, int n);
|
||||
|
||||
// On-disk format for each lemma
|
||||
// +-------------+
|
||||
// | Version (4) |
|
||||
// +-------------+
|
||||
// +-----------+-----------+--------------------+-------------------+
|
||||
// | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
|
||||
// +-----------+-----------+--------------------+-------------------+
|
||||
// ...
|
||||
// +-----------------------+ +-------------+ <---Offset of offset
|
||||
// | Offset1 by_splids (4) | ... | OffsetN (4) |
|
||||
// +-----------------------+ +-------------+
|
||||
#ifdef ___PREDICT_ENABLED___
|
||||
// +----------------------+ +-------------+
|
||||
// | Offset1 by_lemma (4) | ... | OffsetN (4) |
|
||||
// +----------------------+ +-------------+
|
||||
#endif
|
||||
// +------------+ +------------+
|
||||
// | Score1 (4) | ... | ScoreN (4) |
|
||||
// +------------+ +------------+
|
||||
#ifdef ___SYNC_ENABLED___
|
||||
// +-------------+ +-------------+
|
||||
// | NewAdd1 (4) | ... | NewAddN (4) |
|
||||
// +-------------+ +-------------+
|
||||
#endif
|
||||
// +----------------+
|
||||
// | Dict Info (4x) |
|
||||
// +----------------+
|
||||
};
|
||||
} // namespace ime_pinyin
|
||||
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,35 +1,35 @@
|
||||
#include "../src/include/pinyinime.h"
|
||||
#include <codecvt>
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <string>
|
||||
|
||||
/**
 * Convert UTF-16 code units produced by the IME engine into a UTF-8 string.
 *
 * @param buf  Pointer to the first UTF-16 code unit; a null pointer yields an
 *             empty string.
 * @param len  Number of code units to convert (no terminator is required).
 * @return UTF-8 encoding of the first len code units, or an empty string when
 *         there is nothing to convert.
 * @throws std::range_error if std::wstring_convert cannot convert the input.
 */
std::string fromUtf16(const ime_pinyin::char16 *buf, size_t len) {
    if (buf == nullptr || len == 0) {
        return std::string();
    }
    // Convert to standard char16_t by copying element by element. This avoids
    // reading the engine's char16 storage through a pointer to a different
    // type, and works whatever integer type char16 is defined as.
    const std::u16string utf16Str(buf, buf + len);
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.to_bytes(utf16Str);
}
|
||||
|
||||
int main() {
|
||||
if (!ime_pinyin::im_open_decoder("./data/dict_pinyin.dat", "./data/user_dict.dat")) {
|
||||
std::cout << "fany bug.\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string pinyin = "ni'ma'si'le";
|
||||
pinyin = "ni'ma'mei'si";
|
||||
pinyin = "ni'shuo'ni'ma'ne";
|
||||
size_t cand_cnt = ime_pinyin::im_search(pinyin.c_str(), pinyin.size());
|
||||
ime_pinyin::char16 buf[256] = {0};
|
||||
std::string msg;
|
||||
for (size_t i = 0; i < 100; ++i) {
|
||||
ime_pinyin::im_get_candidate(i, buf, 255);
|
||||
size_t len = 0;
|
||||
while (buf[len] != 0 && len < 255) ++len;
|
||||
msg.append(fromUtf16(buf, len) + " ");
|
||||
}
|
||||
std::cout << "候选项数量: " << cand_cnt << std::endl;
|
||||
std::cout << "候选项本体: " << msg << std::endl;
|
||||
return 0;
|
||||
}
|
||||
#include "../src/include/pinyinime.h"
|
||||
#include <codecvt>
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <string>
|
||||
|
||||
/**
 * Convert UTF-16 code units produced by the IME engine into a UTF-8 string.
 *
 * @param buf  Pointer to the first UTF-16 code unit; a null pointer yields an
 *             empty string.
 * @param len  Number of code units to convert (no terminator is required).
 * @return UTF-8 encoding of the first len code units, or an empty string when
 *         there is nothing to convert.
 * @throws std::range_error if std::wstring_convert cannot convert the input.
 */
std::string fromUtf16(const ime_pinyin::char16 *buf, size_t len) {
    if (buf == nullptr || len == 0) {
        return std::string();
    }
    // Convert to standard char16_t by copying element by element. This avoids
    // reading the engine's char16 storage through a pointer to a different
    // type, and works whatever integer type char16 is defined as.
    const std::u16string utf16Str(buf, buf + len);
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.to_bytes(utf16Str);
}
|
||||
|
||||
int main() {
|
||||
if (!ime_pinyin::im_open_decoder("./data/dict_pinyin.dat", "./data/user_dict.dat")) {
|
||||
std::cout << "fany bug.\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string pinyin = "ni'ma'si'le";
|
||||
pinyin = "ni'ma'mei'si";
|
||||
pinyin = "ni'shuo'ni'ma'ne";
|
||||
size_t cand_cnt = ime_pinyin::im_search(pinyin.c_str(), pinyin.size());
|
||||
ime_pinyin::char16 buf[256] = {0};
|
||||
std::string msg;
|
||||
for (size_t i = 0; i < 100; ++i) {
|
||||
ime_pinyin::im_get_candidate(i, buf, 255);
|
||||
size_t len = 0;
|
||||
while (buf[len] != 0 && len < 255) ++len;
|
||||
msg.append(fromUtf16(buf, len) + " ");
|
||||
}
|
||||
std::cout << "候选项数量: " << cand_cnt << std::endl;
|
||||
std::cout << "候选项本体: " << msg << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
Reference in New Issue
Block a user