190 changes: 190 additions & 0 deletions analysis/analyzer/analyzer.go
@@ -0,0 +1,190 @@
package analyzer

import (
    "fmt"

    "github.com/blugelabs/bluge/analysis"
    "github.com/blugelabs/bluge/analysis/char"
    "github.com/blugelabs/bluge/analysis/token"
    "github.com/blugelabs/bluge/analysis/tokenizer"
    phalanxchar "github.com/mosuka/phalanx/analysis/char"
    phalanxtoken "github.com/mosuka/phalanx/analysis/token"
    phalanxtokenizer "github.com/mosuka/phalanx/analysis/tokenizer"
)

type AnalyzerSetting struct {
    CharFilterSettings  []phalanxchar.CharFilterSetting   `json:"char_filters"`
    TokenizerSetting    phalanxtokenizer.TokenizerSetting `json:"tokenizer"`
    TokenFilterSettings []phalanxtoken.TokenFilterSetting `json:"token_filters"`
}

func NewAnalyzer(config AnalyzerSetting) (*analysis.Analyzer, error) {
    var err error

    // Char filters.
    charFilters := make([]analysis.CharFilter, 0)
    charFilterSettings := config.CharFilterSettings
    for _, charFilterSetting := range charFilterSettings {
        switch charFilterSetting.Name {
        case phalanxchar.AsciiFoldingCharFilter:
            charFilter := char.NewASCIIFoldingFilter()
            charFilters = append(charFilters, charFilter)
        case phalanxchar.HtmlCharFilter:
            charFilter := char.NewHTMLCharFilter()
            charFilters = append(charFilters, charFilter)
        case phalanxchar.RegexpCharFilter:
            charFilter, err := phalanxchar.NewRegexpCharFilterWithOptions(charFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            charFilters = append(charFilters, charFilter)
        case phalanxchar.UnicodeNormalizeCharFilter:
            charFilter, err := phalanxchar.NewUnicodeNormalizeCharFilterWithOptions(charFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            charFilters = append(charFilters, charFilter)
        case phalanxchar.ZeroWidthNonJoinerCharFilter:
            charFilter := char.NewZeroWidthNonJoinerCharFilter()
            charFilters = append(charFilters, charFilter)
        default:
            return nil, fmt.Errorf("unknown char filter: %s", charFilterSetting.Name)
        }
    }

    // Token filters.
    tokenFilters := make([]analysis.TokenFilter, 0)
    tokenFilterSettings := config.TokenFilterSettings
    for _, tokenFilterSetting := range tokenFilterSettings {
        switch tokenFilterSetting.Name {
        case phalanxtoken.ApostropheTokenFilter:
            tokenFilter := token.NewApostropheFilter()
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.CamelCaseTokenFilter:
            tokenFilter := token.NewCamelCaseFilter()
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.DictionaryCompoundTokenFilter:
            tokenFilter, err := phalanxtoken.NewDictionaryCompoundFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.EdgeNgramTokenFilter:
            tokenFilter, err := phalanxtoken.NewEdgeNgramFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.ElisionTokenFilter:
            tokenFilter, err := phalanxtoken.NewElisionFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.KeywordMarkerTokenFilter:
            tokenFilter, err := phalanxtoken.NewKeyWordMarkerFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.LengthTokenFilter:
            tokenFilter, err := phalanxtoken.NewLengthFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.LowerCaseTokenFilter:
            tokenFilter := token.NewLowerCaseFilter()
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.NgramTokenFilter:
            tokenFilter, err := phalanxtoken.NewNgramFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.PorterStemmerTokenFilter:
            tokenFilter := token.NewPorterStemmer()
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.ReverseTokenFilter:
            tokenFilter := token.NewReverseFilter()
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.ShingleTokenFilter:
            tokenFilter, err := phalanxtoken.NewShingleFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.StopTokensTokenFilter:
            tokenFilter, err := phalanxtoken.NewStopTokensFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.TruncateTokenFilter:
            tokenFilter, err := phalanxtoken.NewTruncateTokenFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.UnicodeNormalizeTokenFilter:
            tokenFilter, err := phalanxtoken.NewUnicodeNormalizeFilterWithOptions(tokenFilterSetting.Options)
            if err != nil {
                return nil, err
            }
            tokenFilters = append(tokenFilters, tokenFilter)
        case phalanxtoken.UniqueTermTokenFilter:
            tokenFilter := token.NewUniqueTermFilter()
            tokenFilters = append(tokenFilters, tokenFilter)
        default:
            return nil, fmt.Errorf("unknown token filter: %s", tokenFilterSetting.Name)
        }
    }

    // Tokenizer. The outer err declared above is reused here, so the
    // per-case redeclarations are unnecessary.
    tokenizerSetting := config.TokenizerSetting
    var fieldTokenizer analysis.Tokenizer
    switch tokenizerSetting.Name {
    case phalanxtokenizer.CharacterTokenizer:
        fieldTokenizer, err = phalanxtokenizer.NewCharacterTokenizerWithOptions(tokenizerSetting.Options)
        if err != nil {
            return nil, err
        }
    case phalanxtokenizer.ExceptionTokenizer:
        fieldTokenizer, err = phalanxtokenizer.NewExceptionsTokenizerWithOptions(tokenizerSetting.Options)
        if err != nil {
            return nil, err
        }
    case phalanxtokenizer.KagomeTokenizer:
        fieldTokenizer, err = phalanxtokenizer.NewKagomeTokenizerWithOptions(tokenizerSetting.Options)
        if err != nil {
            return nil, err
        }
    case phalanxtokenizer.LetterTokenizer:
        fieldTokenizer = tokenizer.NewLetterTokenizer()
    case phalanxtokenizer.RegexpTokenizer:
        fieldTokenizer, err = phalanxtokenizer.NewRegexpTokenizerWithOptions(tokenizerSetting.Options)
        if err != nil {
            return nil, err
        }
    case phalanxtokenizer.SingleTokenTokenizer:
        fieldTokenizer = tokenizer.NewSingleTokenTokenizer()
    case phalanxtokenizer.UnicodeTokenizer:
        fieldTokenizer = tokenizer.NewUnicodeTokenizer()
    case phalanxtokenizer.WebTokenizer:
        fieldTokenizer = tokenizer.NewWebTokenizer()
    case phalanxtokenizer.WhitespaceTokenizer:
        fieldTokenizer = tokenizer.NewWhitespaceTokenizer()
    default:
        return nil, fmt.Errorf("unknown tokenizer: %s", tokenizerSetting.Name)
    }

    return &analysis.Analyzer{
        CharFilters:  charFilters,
        Tokenizer:    fieldTokenizer,
        TokenFilters: tokenFilters,
    }, nil
}
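Taken together, NewAnalyzer turns a JSON settings document into a runnable bluge analyzer. The following is a minimal usage sketch, not part of this diff: the "unicode_normalize" name comes from char.go below, while the "unicode" tokenizer and "lower_case" token filter names are assumed to follow the same naming scheme, and the Analyze call assumes bluge's analysis.Analyzer API.

package main

import (
    "encoding/json"
    "fmt"
    "log"

    "github.com/mosuka/phalanx/analysis/analyzer"
)

func main() {
    // The JSON shape mirrors the struct tags on AnalyzerSetting.
    settingJSON := `{
        "char_filters": [
            {"name": "unicode_normalize", "options": {"form": "NFKC"}}
        ],
        "tokenizer": {"name": "unicode"},
        "token_filters": [
            {"name": "lower_case"}
        ]
    }`

    var setting analyzer.AnalyzerSetting
    if err := json.Unmarshal([]byte(settingJSON), &setting); err != nil {
        log.Fatal(err)
    }

    a, err := analyzer.NewAnalyzer(setting)
    if err != nil {
        log.Fatal(err)
    }

    // Analyze returns a bluge analysis.TokenStream; print each term.
    for _, t := range a.Analyze([]byte("Hello, World!")) {
        fmt.Println(string(t.Term))
    }
}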
16 changes: 16 additions & 0 deletions analysis/char/char.go
@@ -0,0 +1,16 @@
package char

type CharFilter string

const (
    AsciiFoldingCharFilter       CharFilter = "ascii_folding"
    HtmlCharFilter               CharFilter = "html"
    RegexpCharFilter             CharFilter = "regexp"
    UnicodeNormalizeCharFilter   CharFilter = "unicode_normalize"
    ZeroWidthNonJoinerCharFilter CharFilter = "zero_width_non_joiner"
)

type CharFilterSetting struct {
    Name    CharFilter             `json:"name"`
    Options map[string]interface{} `json:"options"`
}
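For orientation, here is a CharFilterSetting literal as it would look constructed in Go rather than decoded from JSON; a minimal sketch, assuming it sits in the same char package. The options shown match what NewRegexpCharFilterWithOptions (below) expects.

// Equivalent to the JSON document
// {"name": "regexp", "options": {"pattern": "foo", "replacement": "bar"}}.
var regexpSetting = CharFilterSetting{
    Name: RegexpCharFilter,
    Options: map[string]interface{}{
        "pattern":     "foo",
        "replacement": "bar",
    },
}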
38 changes: 38 additions & 0 deletions analysis/char/regexp.go
@@ -0,0 +1,38 @@
package char

import (
    "fmt"
    "regexp"

    "github.com/blugelabs/bluge/analysis/char"
)

// NewRegexpCharFilterWithOptions creates a new RegexpCharFilter with the given options.
// Options example:
// {
//   "pattern": "foo",
//   "replacement": "bar"
// }
func NewRegexpCharFilterWithOptions(opts map[string]interface{}) (*char.RegexpCharFilter, error) {
    patternValue, ok := opts["pattern"]
    if !ok {
        return nil, fmt.Errorf("pattern option does not exist")
    }
    pattern, ok := patternValue.(string)
    if !ok {
        return nil, fmt.Errorf("pattern option is unexpected: %v", patternValue)
    }
    // Compile (rather than MustCompile) so that an invalid user-supplied
    // pattern is reported as an error instead of a panic.
    re, err := regexp.Compile(pattern)
    if err != nil {
        return nil, fmt.Errorf("pattern option is invalid: %v", err)
    }

    replacementValue, ok := opts["replacement"]
    if !ok {
        return nil, fmt.Errorf("replacement option does not exist")
    }
    replacement, ok := replacementValue.(string)
    if !ok {
        return nil, fmt.Errorf("replacement option is unexpected: %v", replacementValue)
    }

    return char.NewRegexpCharFilter(re, []byte(replacement)), nil
}
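A minimal usage sketch, assuming the bluge CharFilter interface (Filter([]byte) []byte):

package main

import (
    "fmt"
    "log"

    phalanxchar "github.com/mosuka/phalanx/analysis/char"
)

func main() {
    filter, err := phalanxchar.NewRegexpCharFilterWithOptions(map[string]interface{}{
        "pattern":     "foo",
        "replacement": "bar",
    })
    if err != nil {
        log.Fatal(err)
    }
    // Every match of the pattern is replaced: "food" -> "bard".
    fmt.Println(string(filter.Filter([]byte("food"))))
}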
37 changes: 37 additions & 0 deletions analysis/char/unicodenorm.go
@@ -0,0 +1,37 @@
package char

import (
    "fmt"

    "github.com/blugelabs/bluge/analysis"
    "github.com/ikawaha/blugeplugin/analysis/lang/ja"
    "golang.org/x/text/unicode/norm"
)

// NewUnicodeNormalizeCharFilterWithOptions creates a new UnicodeNormalizeCharFilter with the given options.
// Options example:
// {
//   "form": "NFKC"
// }
func NewUnicodeNormalizeCharFilterWithOptions(opts map[string]interface{}) (analysis.CharFilter, error) {
    formValue, ok := opts["form"]
    if !ok {
        return nil, fmt.Errorf("form option does not exist")
    }
    form, ok := formValue.(string)
    if !ok {
        return nil, fmt.Errorf("form option is unexpected: %v", formValue)
    }
    switch form {
    case "NFC":
        return ja.NewUnicodeNormalizeCharFilter(norm.NFC), nil
    case "NFD":
        return ja.NewUnicodeNormalizeCharFilter(norm.NFD), nil
    case "NFKC":
        return ja.NewUnicodeNormalizeCharFilter(norm.NFKC), nil
    case "NFKD":
        return ja.NewUnicodeNormalizeCharFilter(norm.NFKD), nil
    default:
        return nil, fmt.Errorf("unknown form option: %v", form)
    }
}
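A minimal usage sketch, again assuming the bluge CharFilter interface:

package main

import (
    "fmt"
    "log"

    phalanxchar "github.com/mosuka/phalanx/analysis/char"
)

func main() {
    filter, err := phalanxchar.NewUnicodeNormalizeCharFilterWithOptions(map[string]interface{}{
        "form": "NFKC",
    })
    if err != nil {
        log.Fatal(err)
    }
    // NFKC folds compatibility characters: full-width "ＡＢＣ" becomes "ABC".
    fmt.Println(string(filter.Filter([]byte("ＡＢＣ"))))
}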
81 changes: 81 additions & 0 deletions analysis/token/dict.go
@@ -0,0 +1,81 @@
package token

import (
    "fmt"

    "github.com/blugelabs/bluge/analysis"
    "github.com/blugelabs/bluge/analysis/token"
)

// NewDictionaryCompoundFilterWithOptions creates a new DictionaryCompoundFilter with the given options.
// Options example:
// {
//   "words": [
//     "soft",
//     "softest",
//     "ball"
//   ],
//   "min_word_size": 5,
//   "min_sub_word_size": 2,
//   "max_sub_word_size": 15,
//   "only_longest_match": false
// }
func NewDictionaryCompoundFilterWithOptions(opts map[string]interface{}) (*token.DictionaryCompoundFilter, error) {
    wordsValue, ok := opts["words"]
    if !ok {
        return nil, fmt.Errorf("words option does not exist")
    }
    words, ok := wordsValue.([]interface{})
    if !ok {
        return nil, fmt.Errorf("words option is unexpected: %v", wordsValue)
    }
    wordMap := analysis.NewTokenMap()
    for _, word := range words {
        str, ok := word.(string)
        if !ok {
            return nil, fmt.Errorf("word is unexpected: %v", word)
        }
        wordMap.AddToken(str)
    }

    // The numeric options are asserted as float64 because encoding/json
    // decodes all JSON numbers into float64 when the target is interface{}.
    minWordSizeValue, ok := opts["min_word_size"]
    if !ok {
        return nil, fmt.Errorf("min_word_size option does not exist")
    }
    minWordSizeNum, ok := minWordSizeValue.(float64)
    if !ok {
        return nil, fmt.Errorf("min_word_size option is unexpected: %v", minWordSizeValue)
    }
    minWordSize := int(minWordSizeNum)

    minSubWordSizeValue, ok := opts["min_sub_word_size"]
    if !ok {
        return nil, fmt.Errorf("min_sub_word_size option does not exist")
    }
    minSubWordSizeNum, ok := minSubWordSizeValue.(float64)
    if !ok {
        return nil, fmt.Errorf("min_sub_word_size option is unexpected: %v", minSubWordSizeValue)
    }
    minSubWordSize := int(minSubWordSizeNum)

    maxSubWordSizeValue, ok := opts["max_sub_word_size"]
    if !ok {
        return nil, fmt.Errorf("max_sub_word_size option does not exist")
    }
    maxSubWordSizeNum, ok := maxSubWordSizeValue.(float64)
    if !ok {
        return nil, fmt.Errorf("max_sub_word_size option is unexpected: %v", maxSubWordSizeValue)
    }
    maxSubWordSize := int(maxSubWordSizeNum)

    onlyLongestMatchValue, ok := opts["only_longest_match"]
    if !ok {
        return nil, fmt.Errorf("only_longest_match option does not exist")
    }
    onlyLongestMatch, ok := onlyLongestMatchValue.(bool)
    if !ok {
        return nil, fmt.Errorf("only_longest_match option is unexpected: %v", onlyLongestMatchValue)
    }

    return token.NewDictionaryCompoundFilter(wordMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}
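A minimal usage sketch. Because the options map stands in for decoded JSON, the numeric values must already be float64, and the token construction assumes bluge's analysis.Token layout; the expectation that sub-words are emitted alongside the original token follows the usual dictionary-compound behavior and is an assumption here.

package main

import (
    "fmt"
    "log"

    "github.com/blugelabs/bluge/analysis"
    phalanxtoken "github.com/mosuka/phalanx/analysis/token"
)

func main() {
    filter, err := phalanxtoken.NewDictionaryCompoundFilterWithOptions(map[string]interface{}{
        "words":              []interface{}{"soft", "softest", "ball"},
        "min_word_size":      float64(5), // float64, mirroring what encoding/json produces
        "min_sub_word_size":  float64(2),
        "max_sub_word_size":  float64(15),
        "only_longest_match": false,
    })
    if err != nil {
        log.Fatal(err)
    }

    // "softball" meets min_word_size, so the dictionary sub-words
    // "soft" and "ball" should be emitted alongside the original token.
    in := analysis.TokenStream{
        &analysis.Token{Term: []byte("softball"), Start: 0, End: 8, PositionIncr: 1},
    }
    for _, t := range filter.Filter(in) {
        fmt.Println(string(t.Term))
    }
}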