Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.idea/
.kiro/
.cursor/
CLAUDE.md
*.o
*.a
*.exe
Expand Down Expand Up @@ -33,7 +34,9 @@ main
mo-tool
/mo-server
/mo-service
/mo-service.log
/mo-debug
/bug.sql
cube*/
store/
mo-data/
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ require (
github.com/tidwall/pretty v1.2.1
github.com/tmc/langchaingo v0.1.13
github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5
github.com/yanyiwu/gojieba v1.4.7
go.starlark.net v0.0.0-20250701195324-d457b4515e0e
go.uber.org/automaxprocs v1.5.3
go.uber.org/ratelimit v0.2.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,8 @@ github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavM
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
github.com/yalp/jsonpath v0.0.0-20180802001716-5cc68e5049a0/go.mod h1:/LWChgwKmvncFJFHJ7Gvn9wZArjbV5/FppcK2fKk/tI=
github.com/yanyiwu/gojieba v1.4.7 h1:2YkXELcYLTE0SJetq6xv4MjpEikWga6VpFn4jIFFQ/k=
github.com/yanyiwu/gojieba v1.4.7/go.mod h1:JUq4DddFVGdHXJHxxepxRmhrKlDpaBxR8O28v6fKYLY=
github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg=
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82/go.mod h1:lgjkn3NuSvDfVJdfcVVdX+jpBxNmX4rDAzaS45IcYoM=
github.com/yudai/pp v2.0.1+incompatible/go.mod h1:PuxR/8QJ7cyCkFp/aUDS+JY727OFEZkTdatxwunjIkc=
Expand Down
2 changes: 2 additions & 0 deletions optools/images/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ FROM matrixorigin/ubuntu:22.04
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/mo-service /mo-service
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/etc /etc
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/thirdparties/install/lib/*.so /usr/local/lib
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/pkg/monlp/tokenizer/dict /usr/local/share/jieba
ENV MO_JIEBA_DICT_DIR=/usr/local/share/jieba

# ldconfig and run mo-service to check if the shared library is found
RUN ldconfig && /mo-service -h
Expand Down
3 changes: 3 additions & 0 deletions optools/images/gpu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,11 @@ FROM nvidia/cuda:13.0.2-cudnn-runtime-ubuntu24.04
COPY --from=builder /matrixone/mo-service /mo-service
COPY --from=builder /matrixone/etc /etc
COPY --from=builder /matrixone/thirdparties/install/lib/*.so /usr/local/lib
COPY --from=builder /matrixone/pkg/monlp/tokenizer/dict /usr/local/share/jieba
COPY --from=builder /root/miniconda/envs/go/lib /root/miniconda/envs/go/lib

ENV MO_JIEBA_DICT_DIR=/usr/local/share/jieba

ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH}"
ENV CONDA_PREFIX=/root/miniconda/envs/go
Expand Down
2 changes: 1 addition & 1 deletion pkg/catalog/secondary_index_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
if def.IndexOption != nil {
parsername := strings.ToLower(def.IndexOption.ParserName)
if len(parsername) > 0 {
if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" && parsername != "gojieba" {
return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
}
res["parser"] = parsername
Expand Down
62 changes: 56 additions & 6 deletions pkg/fulltext/fulltext.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package fulltext

import (
"encoding/json"
"fmt"
"math"
"strings"
Expand Down Expand Up @@ -47,7 +48,12 @@ import (
// Init Search Accum
func NewSearchAccum(srctbl string, tblname string, pattern string, mode int64, params string, scoreAlgo FullTextScoreAlgo) (*SearchAccum, error) {

ps, err := ParsePattern(pattern, mode)
parser, err := parserFromParams(params)
if err != nil {
return nil, err
}

ps, err := ParsePattern(pattern, mode, parser)
if err != nil {
return nil, err
}
Expand All @@ -57,6 +63,20 @@ func NewSearchAccum(srctbl string, tblname string, pattern string, mode int64, p
Pattern: ps, Params: params, Nkeywords: nwords, AnyPlus: hasPatternAnyPlus(ps), ScoreAlgo: scoreAlgo}, nil
}

// parserFromParams pulls the "parser" field out of the JSON-encoded params
// payload (the same blob threaded through fulltext_index_scan / _tokenize).
// An empty params string yields an empty parser name with no error.
func parserFromParams(params string) (string, error) {
	if params == "" {
		return "", nil
	}
	var decoded FullTextParserParam
	err := json.Unmarshal([]byte(params), &decoded)
	if err != nil {
		return "", err
	}
	return decoded.Parser, nil
}

// find pattern by operator
func findPatternByOperator(ps []*Pattern, op int) []*Pattern {
var result []*Pattern
Expand Down Expand Up @@ -822,8 +842,38 @@ func GetResultCountFromPattern(ps []*Pattern) int {
return cnt
}

// parsePatternInNLModeJieba tokenizes the search string with gojieba, emitting
// one TEXT pattern per word. Unlike the ngram path there is no minimum-length
// rewrite: short tokens map directly to TEXT lookups since the index stores
// jieba-segmented words rather than overlapping bigrams.
//
// HMM is disabled here so the query and index sides segment the same way; with
// HMM on the query side, traditional-Chinese bigrams (e.g. 教學, 中華) appear at
// query time but not in the index, producing queries that can never match.
//
// Returns an error when the input segments to zero tokens.
func parsePatternInNLModeJieba(pattern string) ([]*Pattern, error) {
	tok := tokenizer.SharedJiebaTokenizer(false)
	list := make([]*Pattern, 0, 8)
	for t := range tok.Tokenize([]byte(pattern)) {
		// TokenBytes[0] holds the token's byte length; the word bytes follow it.
		slen := t.TokenBytes[0]
		word := string(t.TokenBytes[1 : slen+1])
		list = append(list, &Pattern{Text: word, Operator: TEXT, Position: t.BytePos})
	}
	if len(list) == 0 {
		// Fixed typo in the original message ("onverted" -> "converted").
		return nil, moerr.NewInternalErrorNoCtx("Invalid input search string. search string converted to empty pattern")
	}
	// Assign sequential indexes so downstream result handling can reference
	// each pattern's slot.
	idx := int32(0)
	for _, p := range list {
		assignPatternIndex(p, &idx)
	}
	return list, nil
}

// Parse search string in natural language mode
func ParsePatternInNLMode(pattern string) ([]*Pattern, error) {
func ParsePatternInNLMode(pattern string, parser string) ([]*Pattern, error) {
if parser == "gojieba" {
return parsePatternInNLModeJieba(pattern)
}

runeSlice := []rune(pattern)
ngram_size := 3
// if number of character is small than Ngram size = 3, do prefix search
Expand All @@ -832,14 +882,14 @@ func ParsePatternInNLMode(pattern string) ([]*Pattern, error) {
}

list := make([]*Pattern, 0, 32)
tok, _ := tokenizer.NewSimpleTokenizer([]byte(pattern))
tok := tokenizer.NewSimpleTokenizer()

currBytePos := int32(0)
currEndBytePos := int32(0)

overlaps := make([]tokenizer.Token, 0, 8)

for t := range tok.Tokenize() {
for t := range tok.Tokenize([]byte(pattern)) {

slen := t.TokenBytes[0]
word := string(t.TokenBytes[1 : slen+1])
Expand Down Expand Up @@ -943,11 +993,11 @@ func PatternOptimizeJoin(ps []*Pattern) []*Pattern {
}

// Parse search string into list of patterns
func ParsePattern(pattern string, mode int64) ([]*Pattern, error) {
func ParsePattern(pattern string, mode int64, parser string) ([]*Pattern, error) {
switch mode {
case int64(tree.FULLTEXT_NL), int64(tree.FULLTEXT_DEFAULT):
// Natural Language Mode or default mode
ps, err := ParsePatternInNLMode(pattern)
ps, err := ParsePatternInNLMode(pattern, parser)
if err != nil {
return nil, err
}
Expand Down
108 changes: 108 additions & 0 deletions pkg/fulltext/jieba_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Copyright 2024 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fulltext

import (
"strings"
"testing"

"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// collectTexts flattens a pattern list into just its Text fields, in order.
func collectTexts(ps []*Pattern) []string {
	texts := make([]string, 0, len(ps))
	for i := range ps {
		texts = append(texts, ps[i].Text)
	}
	return texts
}

func TestParsePatternInNLModeJieba(t *testing.T) {
	ps, err := ParsePatternInNLMode("我来到北京清华大学", "gojieba")
	require.Nil(t, err)

	want := []string{"我", "来到", "北京", "清华大学"}
	assert.Equal(t, want, collectTexts(ps))

	// The jieba path never rewrites tokens into STAR prefixes; every
	// pattern it emits is a plain TEXT lookup.
	for i := range ps {
		assert.Equal(t, TEXT, ps[i].Operator)
	}
}

func TestParsePatternInNLModeNgramUnchanged(t *testing.T) {
	// With no parser configured the legacy ngram behavior applies: an input
	// shorter than the ngram size becomes a single STAR prefix pattern.
	ps, err := ParsePatternInNLMode("hi", "")
	require.Nil(t, err)
	require.Len(t, ps, 1)

	first := ps[0]
	assert.Equal(t, "hi*", first.Text)
	assert.Equal(t, STAR, first.Operator)
}

func TestParsePatternRoutesByParser(t *testing.T) {
	// gojieba: "苹果香蕉" segments cleanly into two exact TEXT tokens.
	jiebaPs, err := ParsePattern("苹果香蕉", int64(tree.FULLTEXT_NL), "gojieba")
	require.Nil(t, err)
	jiebaTexts := collectTexts(jiebaPs)
	assert.Equal(t, []string{"苹果", "香蕉"}, jiebaTexts)

	// The ngram path on the same input emits overlapping bigrams, which
	// differs visibly from a 2-element exact word list.
	ngramPs, err := ParsePattern("苹果香蕉", int64(tree.FULLTEXT_NL), "")
	require.Nil(t, err)
	assert.NotEqual(t, jiebaTexts, collectTexts(ngramPs))
}

func TestNewSearchAccumExtractsParserFromParams(t *testing.T) {
	// The params JSON selects gojieba, so the accumulated pattern must be
	// jieba-segmented words rather than ngram bigrams.
	accum, err := NewSearchAccum("src", "idx", "我爱北京",
		int64(tree.FULLTEXT_NL), `{"parser":"gojieba"}`, ALGO_TFIDF)
	require.Nil(t, err)
	require.Len(t, accum.Pattern, 3)
	assert.Equal(t, []string{"我", "爱", "北京"}, collectTexts(accum.Pattern))
}

func TestParserFromParams(t *testing.T) {
	for _, tc := range []struct {
		params string
		want   string
	}{
		{params: "", want: ""},
		{params: `{"parser":"gojieba"}`, want: "gojieba"},
		{params: `{"parser":"ngram"}`, want: "ngram"},
		{params: `{"async":"true"}`, want: ""},
	} {
		got, err := parserFromParams(tc.params)
		require.Nil(t, err, tc.params)
		assert.Equal(t, tc.want, got, tc.params)
	}

	// Malformed JSON must surface an error instead of a silent default.
	_, err := parserFromParams("not json")
	assert.NotNil(t, err)
}

func TestPatternToSqlGojiebaBoolean(t *testing.T) {
	// Boolean-mode re-tokenization of a Chinese keyword must go through jieba.
	accum, err := NewSearchAccum("src", "idx", "+清华大学",
		int64(tree.FULLTEXT_BOOLEAN), `{"parser":"gojieba"}`, ALGO_TFIDF)
	require.Nil(t, err)

	sql, err := PatternToSql(accum.Pattern, accum.Mode, accum.TblName, "gojieba", ALGO_TFIDF)
	require.Nil(t, err)

	// Jieba keeps "清华大学" as one dictionary word, so the generated SQL
	// must look up that exact word — not 3-char overlapping bigrams.
	assert.Contains(t, sql, "word = '清华大学'")
	assert.False(t, strings.Contains(sql, "word = '清华大'"),
		"sql should not contain ngram bigram 清华大: %s", sql)
}
2 changes: 1 addition & 1 deletion pkg/fulltext/sql.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ func GenTextSql(p *Pattern, mode int64, idxtbl string, parser string) (string, e
return sql, nil
}

ps, err := ParsePatternInNLMode(p.Text)
ps, err := ParsePatternInNLMode(p.Text, parser)
if err != nil {
return "", err
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/fulltext/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func PatternListToString(ps []*Pattern) string {
}

func PatternToString(pattern string, mode int64) (string, error) {
ps, err := ParsePattern(pattern, mode)
ps, err := ParsePattern(pattern, mode, "")
if err != nil {
return "", err
}
Expand All @@ -200,7 +200,7 @@ func PatternListToStringWithPosition(ps []*Pattern) string {
}

func PatternToStringWithPosition(pattern string, mode int64) (string, error) {
ps, err := ParsePattern(pattern, mode)
ps, err := ParsePattern(pattern, mode, "")
if err != nil {
return "", err
}
Expand Down
20 changes: 20 additions & 0 deletions pkg/monlp/tokenizer/dict/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
The MIT License (MIT)

Copyright (c) 2013

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 changes: 31 additions & 0 deletions pkg/monlp/tokenizer/dict/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CppJieba字典

文件后缀名代表的是词典的编码方式。
比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。


## 分词

### jieba.dict.utf8/gbk

作为最大概率法(MPSegment: Max Probability)分词所使用的词典。

### hmm_model.utf8/gbk

作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。

__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__


## 关键词抽取

### idf.utf8

IDF(Inverse Document Frequency)
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。

### stop_words.utf8

停用词词典


34 changes: 34 additions & 0 deletions pkg/monlp/tokenizer/dict/hmm_model.utf8

Large diffs are not rendered by default.

Loading
Loading