init

vendor/github.com/go-ego/gse/segmenter.go (generated, vendored, new file, 530 lines)
@@ -0,0 +1,530 @@
// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

/*
package gse Go efficient text segmentation, Go word segmentation
*/
package gse

import (
	"bufio"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path"
	"runtime"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

const (
	version string = "v0.10.0.106, Danube River!"

	minTokenFrequency = 2 // only read tokens with at least this frequency from the dictionary file
)

// GetVersion gets the gse version
func GetVersion() string {
	return version
}

// Segmenter is the segmenter struct
type Segmenter struct {
	dict *Dictionary
}

// jumper records, for one character position, the forward jump information
// used by the Viterbi algorithm: the chosen token and the minimum path value
type jumper struct {
	minDistance float32
	token       *Token
}

// Dictionary returns the dictionary used by the segmenter
func (seg *Segmenter) Dictionary() *Dictionary {
	return seg.dict
}

// getCurrentFilePath gets the current file path
func getCurrentFilePath() string {
	_, filePath, _, _ := runtime.Caller(1)
	return filePath
}

// Read reads the dict file
func (seg *Segmenter) Read(file string) error {
	log.Printf("Load the gse dictionary: \"%s\" ", file)
	dictFile, err := os.Open(file)
	if err != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, err)
		return err
	}
	defer dictFile.Close()

	reader := bufio.NewReader(dictFile)
	var (
		text      string
		freqText  string
		frequency int
		pos       string
	)

	// read the tokens line by line
	line := 0
	for {
		line++
		size, fsErr := fmt.Fscanln(reader, &text, &freqText, &pos)
		if fsErr != nil {
			if fsErr == io.EOF {
				// End of file
				break
			}

			if size > 0 {
				log.Printf("File '%v' line \"%v\" read error: %v, skip",
					file, line, fsErr.Error())
			} else {
				log.Printf("File '%v' line \"%v\" is empty, read error: %v, skip",
					file, line, fsErr.Error())
			}
		}

		if size == 0 {
			// end of file or a bad line
			// break
			continue
		} else if size < 2 {
			// invalid line
			continue
		} else if size == 2 {
			// set the part of speech to an empty string when it is not annotated
			pos = ""
		}

		// parse the token frequency
		var err error
		frequency, err = strconv.Atoi(freqText)
		if err != nil {
			continue
		}

		// filter out tokens whose frequency is too low
		if frequency < minTokenFrequency {
			continue
		}
		// filter: lower the frequency of single-rune tokens
		if len([]rune(text)) < 2 {
			// continue
			frequency = 2
		}

		// add the token to the dictionary
		words := splitTextToWords([]byte(text))
		token := Token{text: words, frequency: frequency, pos: pos}
		seg.dict.addToken(token)
	}

	return nil
}
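
// A hypothetical dictionary fragment in the whitespace-separated
// "text frequency [pos]" format that Read parses with fmt.Fscanln;
// the entries below are made up for illustration:
//
//	世界 34387 n
//	你好 5931 l
//	gopher 15
//
// Lines whose frequency is below minTokenFrequency are skipped, and a
// single-rune token has its frequency lowered to 2 before it is added.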

// DictPaths gets the dict paths
func DictPaths(dictDir, filePath string) (files []string) {
	var dictPath string

	if filePath == "en" {
		return
	}

	if filePath == "zh" {
		dictPath = path.Join(dictDir, "dict/dictionary.txt")
		files = []string{dictPath}

		return
	}

	if filePath == "jp" {
		dictPath = path.Join(dictDir, "dict/jp/dict.txt")
		files = []string{dictPath}

		return
	}

	// if strings.Contains(filePath, ",") {
	fileName := strings.Split(filePath, ",")
	for i := 0; i < len(fileName); i++ {
		if fileName[i] == "jp" {
			dictPath = path.Join(dictDir, "dict/jp/dict.txt")
		}

		if fileName[i] == "zh" {
			dictPath = path.Join(dictDir, "dict/dictionary.txt")
		}

		// if str[i] == "ti" {
		// }

		dictName := fileName[i] != "en" && fileName[i] != "zh" &&
			fileName[i] != "jp" && fileName[i] != "ti"

		if dictName {
			dictPath = fileName[i]
		}

		if dictPath != "" {
			files = append(files, dictPath)
		}
	}
	// }
	log.Println("Dict files path: ", files)

	return
}
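
// A sketch of how DictPaths resolves a comma-separated list, assuming an
// illustrative data directory "/path/to/gse/data":
//
//	files := DictPaths("/path/to/gse/data", "zh,jp,my_dict.txt")
//	// -> ["/path/to/gse/data/dict/dictionary.txt",
//	//     "/path/to/gse/data/dict/jp/dict.txt",
//	//     "my_dict.txt"]
//
// Note that custom dictionary names are used as given rather than being
// joined onto dictDir.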

// IsJp returns true if the text contains a Japanese (Hiragana or Katakana) character
func IsJp(segText string) bool {
	for _, r := range segText {
		jp := unicode.Is(unicode.Scripts["Hiragana"], r) ||
			unicode.Is(unicode.Scripts["Katakana"], r)
		if jp {
			return true
		}
	}
	return false
}

// SegToken adds the segmenter tokens
func (seg *Segmenter) SegToken() {
	// compute the path value of each token; see the Token struct comments
	// for the meaning of the path value
	logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
	for i := range seg.dict.tokens {
		token := &seg.dict.tokens[i]
		token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
	}

	// further split each token for search-engine mode;
	// see the Token struct comments for the usage of this mode.
	for i := range seg.dict.tokens {
		token := &seg.dict.tokens[i]
		segments := seg.segmentWords(token.text, true)

		// count the sub-tokens that need to be added
		numTokensToAdd := 0
		for iToken := 0; iToken < len(segments); iToken++ {
			// if len(segments[iToken].token.text) > 1 {
			// skip sub-tokens that are a single character
			// TODO: this deserves further thought, especially when the
			// dictionary contains English compound words
			if len(segments[iToken].token.text) > 0 {
				hasJp := false
				if len(segments[iToken].token.text) == 1 {
					segText := string(segments[iToken].token.text[0])
					hasJp = IsJp(segText)
				}

				if !hasJp {
					numTokensToAdd++
				}
			}
		}
		token.segments = make([]*Segment, numTokensToAdd)

		// add the sub-tokens
		iSegmentsToAdd := 0
		for iToken := 0; iToken < len(segments); iToken++ {
			// if len(segments[iToken].token.text) > 1 {
			if len(segments[iToken].token.text) > 0 {
				hasJp := false
				if len(segments[iToken].token.text) == 1 {
					segText := string(segments[iToken].token.text[0])
					hasJp = IsJp(segText)
				}

				if !hasJp {
					token.segments[iSegmentsToAdd] = &segments[iToken]
					iSegmentsToAdd++
				}
			}
		}
	}

}
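
// A worked example of the distance formula above, with made-up numbers:
// if the dictionary's totalFrequency were 1,048,576 (2^20) and a token's
// frequency were 1,024 (2^10), its distance would be
//
//	log2(1048576) - log2(1024) = 20 - 10 = 10
//
// so rarer tokens get larger distances, and the Viterbi search below prefers
// segmentations whose tokens are more frequent (smaller total distance).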

// LoadDict loads the dictionary from the file
//
// The format of the dictionary is (one participle per line):
//
//	participle text, frequency, part of speech
//
// Multiple dictionary files can be loaded, with the file names separated
// by ","; dictionaries listed first take priority when loading a participle,
// such as: "user_dictionary.txt,common_dictionary.txt".
// When a participle appears both in the user dictionary and
// in the common dictionary, the user dictionary is given priority.
func (seg *Segmenter) LoadDict(files ...string) error {
	seg.dict = NewDict()

	var (
		dictDir  = path.Join(path.Dir(getCurrentFilePath()), "data")
		dictPath string
		// load bool
	)

	if len(files) > 0 {
		dictFiles := DictPaths(dictDir, files[0])
		if len(dictFiles) > 0 {
			// load = true
			// files = dictFiles
			for i := 0; i < len(dictFiles); i++ {
				err := seg.Read(dictFiles[i])
				if err != nil {
					return err
				}
			}
		}
	}

	if len(files) == 0 {
		dictPath = path.Join(dictDir, "dict/dictionary.txt")
		// files = []string{dictPath}
		err := seg.Read(dictPath)
		if err != nil {
			return err
		}
	}

	// if files[0] != "" && files[0] != "en" && !load {
	// 	for _, file := range strings.Split(files[0], ",") {
	// 		// for _, file := range files {
	// 		err := seg.Read(file)
	// 		if err != nil {
	// 			return err
	// 		}
	// 	}
	// }

	seg.SegToken()
	log.Println("Gse dictionary loading finished.")

	return nil
}
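
// A minimal usage sketch (hypothetical calling code, not part of this file),
// assuming the default embedded Chinese dictionary is present:
//
//	var seg gse.Segmenter
//	if err := seg.LoadDict(); err != nil { // or seg.LoadDict("zh,user_dict.txt")
//		log.Fatal(err)
//	}
//	segments := seg.Segment([]byte("你好世界"))
//	fmt.Println(gse.ToString(segments))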

// Segment segments the text
//
// Input:
//	bytes	a byte slice of UTF-8 text
//
// Output:
//	[]Segment	the resulting segments
func (seg *Segmenter) Segment(bytes []byte) []Segment {
	return seg.internalSegment(bytes, false)
}

// ModeSegment segments using search mode if searchMode is true
func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
	var mode bool
	if len(searchMode) > 0 {
		mode = searchMode[0]
	}

	return seg.internalSegment(bytes, mode)
}

// Slice uses ModeSegment to segment and returns a []string,
// using search mode if searchMode is true
func (seg *Segmenter) Slice(bytes []byte, searchMode ...bool) []string {
	segs := seg.ModeSegment(bytes, searchMode...)
	return ToSlice(segs, searchMode...)
}

// String uses ModeSegment to segment and returns a string,
// using search mode if searchMode is true
func (seg *Segmenter) String(bytes []byte, searchMode ...bool) string {
	segs := seg.ModeSegment(bytes, searchMode...)
	return ToString(segs, searchMode...)
}
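
// A sketch of the difference between the two modes; the exact output depends
// on the loaded dictionary, so the results below are only indicative:
//
//	seg.String([]byte("中华人民共和国"))       // normal mode: whole dictionary words
//	seg.String([]byte("中华人民共和国"), true) // search mode: also emits the sub-words
//	seg.Slice([]byte("hello 世界"))            // -> e.g. []string{"hello", " ", "世界"}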

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
	// handle the special case
	if len(bytes) == 0 {
		// return []Segment{}
		return nil
	}

	// split into characters
	text := splitTextToWords(bytes)

	return seg.segmentWords(text, searchMode)
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
	// in search mode a single-character token cannot be split any further
	if searchMode && len(text) == 1 {
		return nil
	}

	// jumpers holds the forward jump information at each character:
	// the token chosen for the jump and the minimum path value
	// from the start of the text segment to this character
	jumpers := make([]jumper, len(text))

	if seg.dict == nil {
		return nil
	}

	tokens := make([]*Token, seg.dict.maxTokenLen)
	for current := 0; current < len(text); current++ {
		// find the shortest path at the previous character
		// in order to compute the following path values
		var baseDistance float32
		if current == 0 {
			// the base distance is zero when this character starts the text
			baseDistance = 0
		} else {
			baseDistance = jumpers[current-1].minDistance
		}

		// find all tokens that start at the current character
		numTokens := seg.dict.lookupTokens(
			text[current:minInt(current+seg.dict.maxTokenLen, len(text))], tokens)

		// for every candidate token, update the jump information
		// at the token's last character
		for iToken := 0; iToken < numTokens; iToken++ {
			location := current + len(tokens[iToken].text) - 1
			if !searchMode || current != 0 || location != len(text)-1 {
				updateJumper(&jumpers[location], baseDistance, tokens[iToken])
			}
		}

		// when no token matches the current character, add a pseudo token
		if numTokens == 0 || len(tokens[0].text) > 1 {
			updateJumper(&jumpers[current], baseDistance,
				&Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
		}
	}

	// first backward pass: count the segments to output
	numSeg := 0
	for index := len(text) - 1; index >= 0; {
		location := index - len(jumpers[index].token.text) + 1
		numSeg++
		index = location - 1
	}

	// second backward pass: add the segments to the final result
	outputSegments := make([]Segment, numSeg)
	for index := len(text) - 1; index >= 0; {
		location := index - len(jumpers[index].token.text) + 1
		numSeg--
		outputSegments[numSeg].token = jumpers[index].token
		index = location - 1
	}

	// compute the byte position of each segment
	bytePosition := 0
	for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
		outputSegments[iSeg].start = bytePosition
		bytePosition += textSliceByteLen(outputSegments[iSeg].token.text)
		outputSegments[iSeg].end = bytePosition
	}
	return outputSegments
}

// updateJumper updates the jump information when:
//  1. the position has never been visited (jumper.minDistance is zero), or
//  2. the current shortest path at the position is longer than the new path.
// The shortest path value at the position is set to baseDistance plus the
// distance of the new token.
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
	newDistance := baseDistance + token.distance
	if jumper.minDistance == 0 || jumper.minDistance > newDistance {
		jumper.minDistance = newDistance
		jumper.token = token
	}
}
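
// For intuition, with made-up distances: if "中华" and "人民" each had a
// distance of 6 while the single characters used the pseudo-token distance
// of 32, reaching character index 3 through the two dictionary words would
// cost 6 + 6 = 12, whereas going character by character would cost
// 4 * 32 = 128, so the jumpers keep the word-level path.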

// minInt returns the smaller of two integers
func minInt(a, b int) int {
	if a > b {
		return b
	}
	return a
}

// maxInt returns the larger of two integers
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// splitTextToWords splits the text into characters
func splitTextToWords(text Text) []Text {
	output := make([]Text, 0, len(text)/3)
	current := 0
	inAlphanumeric := true
	alphanumericStart := 0
	for current < len(text) {
		r, size := utf8.DecodeRune(text[current:])
		if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
			// the current rune is a Latin letter or a digit (not CJK)
			if !inAlphanumeric {
				alphanumericStart = current
				inAlphanumeric = true
			}
		} else {
			if inAlphanumeric {
				inAlphanumeric = false
				if current != 0 {
					output = append(output, toLower(text[alphanumericStart:current]))
				}
			}
			output = append(output, text[current:current+size])
		}
		current += size
	}

	// handle the case where the last character is alphanumeric
	if inAlphanumeric {
		if current != 0 {
			output = append(output, toLower(text[alphanumericStart:current]))
		}
	}

	return output
}
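
// Roughly, splitTextToWords keeps each CJK rune as its own element while
// grouping and lower-casing runs of letters and digits, so for example:
//
//	splitTextToWords([]byte("Hello世界2018"))
//	// -> ["hello", "世", "界", "2018"]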

// toLower converts English words to lower case
func toLower(text []byte) []byte {
	output := make([]byte, len(text))
	for i, t := range text {
		if t >= 'A' && t <= 'Z' {
			output[i] = t - 'A' + 'a'
		} else {
			output[i] = t
		}
	}
	return output
}