Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.idea/
.kiro/
.cursor/
CLAUDE.md
*.o
*.a
*.exe
Expand Down Expand Up @@ -33,7 +34,9 @@ main
mo-tool
/mo-server
/mo-service
/mo-service.log
/mo-debug
/bug.sql
cube*/
store/
mo-data/
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ require (
github.com/tidwall/pretty v1.2.1
github.com/tmc/langchaingo v0.1.13
github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5
github.com/yanyiwu/gojieba v1.4.7
go.starlark.net v0.0.0-20250701195324-d457b4515e0e
go.uber.org/automaxprocs v1.5.3
go.uber.org/ratelimit v0.2.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,8 @@ github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavM
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
github.com/yalp/jsonpath v0.0.0-20180802001716-5cc68e5049a0/go.mod h1:/LWChgwKmvncFJFHJ7Gvn9wZArjbV5/FppcK2fKk/tI=
github.com/yanyiwu/gojieba v1.4.7 h1:2YkXELcYLTE0SJetq6xv4MjpEikWga6VpFn4jIFFQ/k=
github.com/yanyiwu/gojieba v1.4.7/go.mod h1:JUq4DddFVGdHXJHxxepxRmhrKlDpaBxR8O28v6fKYLY=
github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg=
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82/go.mod h1:lgjkn3NuSvDfVJdfcVVdX+jpBxNmX4rDAzaS45IcYoM=
github.com/yudai/pp v2.0.1+incompatible/go.mod h1:PuxR/8QJ7cyCkFp/aUDS+JY727OFEZkTdatxwunjIkc=
Expand Down
2 changes: 2 additions & 0 deletions optools/images/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ FROM matrixorigin/ubuntu:22.04
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/mo-service /mo-service
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/etc /etc
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/thirdparties/install/lib/*.so /usr/local/lib
COPY --from=builder /go/src/github.com/matrixorigin/matrixone/pkg/monlp/tokenizer/dict /usr/local/share/jieba
ENV MO_JIEBA_DICT_DIR=/usr/local/share/jieba

# ldconfig and run mo-service to check if the shared library is found
RUN ldconfig && /mo-service -h
Expand Down
3 changes: 3 additions & 0 deletions optools/images/gpu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,11 @@ FROM nvidia/cuda:13.0.2-cudnn-runtime-ubuntu24.04
COPY --from=builder /matrixone/mo-service /mo-service
COPY --from=builder /matrixone/etc /etc
COPY --from=builder /matrixone/thirdparties/install/lib/*.so /usr/local/lib
COPY --from=builder /matrixone/pkg/monlp/tokenizer/dict /usr/local/share/jieba
COPY --from=builder /root/miniconda/envs/go/lib /root/miniconda/envs/go/lib

ENV MO_JIEBA_DICT_DIR=/usr/local/share/jieba

ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH}"
ENV CONDA_PREFIX=/root/miniconda/envs/go
Expand Down
2 changes: 1 addition & 1 deletion pkg/catalog/secondary_index_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
if def.IndexOption != nil {
parsername := strings.ToLower(def.IndexOption.ParserName)
if len(parsername) > 0 {
if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" && parsername != "gojieba" {
return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
}
res["parser"] = parsername
Expand Down
62 changes: 56 additions & 6 deletions pkg/fulltext/fulltext.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package fulltext

import (
"encoding/json"
"fmt"
"math"
"strings"
Expand Down Expand Up @@ -47,7 +48,12 @@ import (
// Init Search Accum
func NewSearchAccum(srctbl string, tblname string, pattern string, mode int64, params string, scoreAlgo FullTextScoreAlgo) (*SearchAccum, error) {

ps, err := ParsePattern(pattern, mode)
parser, err := parserFromParams(params)
if err != nil {
return nil, err
}

ps, err := ParsePattern(pattern, mode, parser)
if err != nil {
return nil, err
}
Expand All @@ -57,6 +63,20 @@ func NewSearchAccum(srctbl string, tblname string, pattern string, mode int64, p
Pattern: ps, Params: params, Nkeywords: nwords, AnyPlus: hasPatternAnyPlus(ps), ScoreAlgo: scoreAlgo}, nil
}

// parserFromParams pulls the "parser" field out of the JSON-encoded params
// payload (the same blob threaded through fulltext_index_scan / _tokenize).
// An empty params string yields an empty parser name with no error.
func parserFromParams(params string) (string, error) {
	if params == "" {
		return "", nil
	}
	var decoded FullTextParserParam
	err := json.Unmarshal([]byte(params), &decoded)
	if err != nil {
		return "", err
	}
	return decoded.Parser, nil
}

// find pattern by operator
func findPatternByOperator(ps []*Pattern, op int) []*Pattern {
var result []*Pattern
Expand Down Expand Up @@ -822,8 +842,38 @@ func GetResultCountFromPattern(ps []*Pattern) int {
return cnt
}

// parsePatternInNLModeJieba tokenizes the search string with gojieba, emitting
// one TEXT pattern per word. Unlike the ngram path there is no minimum-length
// rewrite: short tokens map directly to TEXT lookups since the index stores
// jieba-segmented words rather than overlapping bigrams.
//
// HMM is disabled here so the query and index sides segment the same way; with
// HMM on the query side, traditional-Chinese bigrams (e.g. 教學, 中華) appear at
// query time but not in the index, producing queries that can never match.
//
// Returns an error when the input segments to zero tokens.
func parsePatternInNLModeJieba(pattern string) ([]*Pattern, error) {
	tok := tokenizer.SharedJiebaTokenizer(false)
	list := make([]*Pattern, 0, 8)
	for t := range tok.Tokenize([]byte(pattern)) {
		// TokenBytes[0] holds the token's byte length; the word bytes follow it.
		slen := t.TokenBytes[0]
		word := string(t.TokenBytes[1 : slen+1])
		list = append(list, &Pattern{Text: word, Operator: TEXT, Position: t.BytePos})
	}
	if len(list) == 0 {
		// Fixed typo in the original message ("onverted" -> "converted").
		return nil, moerr.NewInternalErrorNoCtx("Invalid input search string. search string converted to empty pattern")
	}
	// Assign sequential indexes so downstream result handling can reference
	// each pattern's slot.
	idx := int32(0)
	for _, p := range list {
		assignPatternIndex(p, &idx)
	}
	return list, nil
}

// Parse search string in natural language mode
func ParsePatternInNLMode(pattern string) ([]*Pattern, error) {
func ParsePatternInNLMode(pattern string, parser string) ([]*Pattern, error) {
if parser == "gojieba" {
return parsePatternInNLModeJieba(pattern)
}

runeSlice := []rune(pattern)
ngram_size := 3
// if number of character is small than Ngram size = 3, do prefix search
Expand All @@ -832,14 +882,14 @@ func ParsePatternInNLMode(pattern string) ([]*Pattern, error) {
}

list := make([]*Pattern, 0, 32)
tok, _ := tokenizer.NewSimpleTokenizer([]byte(pattern))
tok := tokenizer.NewSimpleTokenizer()

currBytePos := int32(0)
currEndBytePos := int32(0)

overlaps := make([]tokenizer.Token, 0, 8)

for t := range tok.Tokenize() {
for t := range tok.Tokenize([]byte(pattern)) {

slen := t.TokenBytes[0]
word := string(t.TokenBytes[1 : slen+1])
Expand Down Expand Up @@ -943,11 +993,11 @@ func PatternOptimizeJoin(ps []*Pattern) []*Pattern {
}

// Parse search string into list of patterns
func ParsePattern(pattern string, mode int64) ([]*Pattern, error) {
func ParsePattern(pattern string, mode int64, parser string) ([]*Pattern, error) {
switch mode {
case int64(tree.FULLTEXT_NL), int64(tree.FULLTEXT_DEFAULT):
// Natural Language Mode or default mode
ps, err := ParsePatternInNLMode(pattern)
ps, err := ParsePatternInNLMode(pattern, parser)
if err != nil {
return nil, err
}
Expand Down
108 changes: 108 additions & 0 deletions pkg/fulltext/jieba_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Copyright 2024 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fulltext

import (
"strings"
"testing"

"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// collectTexts flattens a pattern list into just its Text fields, in order.
func collectTexts(ps []*Pattern) []string {
	texts := make([]string, 0, len(ps))
	for i := range ps {
		texts = append(texts, ps[i].Text)
	}
	return texts
}

func TestParsePatternInNLModeJieba(t *testing.T) {
	ps, err := ParsePatternInNLMode("我来到北京清华大学", "gojieba")
	require.Nil(t, err)

	want := []string{"我", "来到", "北京", "清华大学"}
	assert.Equal(t, want, collectTexts(ps))

	// The jieba path never rewrites tokens into STAR prefixes; every
	// pattern it emits is a plain TEXT lookup.
	for i := range ps {
		assert.Equal(t, TEXT, ps[i].Operator)
	}
}

func TestParsePatternInNLModeNgramUnchanged(t *testing.T) {
	// With no parser configured the legacy ngram behavior applies: an input
	// shorter than the ngram size becomes a single STAR prefix pattern.
	ps, err := ParsePatternInNLMode("hi", "")
	require.Nil(t, err)
	require.Len(t, ps, 1)

	first := ps[0]
	assert.Equal(t, "hi*", first.Text)
	assert.Equal(t, STAR, first.Operator)
}

func TestParsePatternRoutesByParser(t *testing.T) {
	// gojieba: "苹果香蕉" segments cleanly into two exact TEXT tokens.
	jiebaPs, err := ParsePattern("苹果香蕉", int64(tree.FULLTEXT_NL), "gojieba")
	require.Nil(t, err)
	jiebaTexts := collectTexts(jiebaPs)
	assert.Equal(t, []string{"苹果", "香蕉"}, jiebaTexts)

	// The ngram path on the same input emits overlapping bigrams, which
	// differs visibly from a 2-element exact word list.
	ngramPs, err := ParsePattern("苹果香蕉", int64(tree.FULLTEXT_NL), "")
	require.Nil(t, err)
	assert.NotEqual(t, jiebaTexts, collectTexts(ngramPs))
}

func TestNewSearchAccumExtractsParserFromParams(t *testing.T) {
	// The params JSON selects gojieba, so the accumulated pattern must be
	// jieba-segmented words rather than ngram bigrams.
	accum, err := NewSearchAccum("src", "idx", "我爱北京",
		int64(tree.FULLTEXT_NL), `{"parser":"gojieba"}`, ALGO_TFIDF)
	require.Nil(t, err)
	require.Len(t, accum.Pattern, 3)
	assert.Equal(t, []string{"我", "爱", "北京"}, collectTexts(accum.Pattern))
}

func TestParserFromParams(t *testing.T) {
	for _, tc := range []struct {
		params string
		want   string
	}{
		{params: "", want: ""},
		{params: `{"parser":"gojieba"}`, want: "gojieba"},
		{params: `{"parser":"ngram"}`, want: "ngram"},
		{params: `{"async":"true"}`, want: ""},
	} {
		got, err := parserFromParams(tc.params)
		require.Nil(t, err, tc.params)
		assert.Equal(t, tc.want, got, tc.params)
	}

	// Malformed JSON must surface an error instead of a silent default.
	_, err := parserFromParams("not json")
	assert.NotNil(t, err)
}

func TestPatternToSqlGojiebaBoolean(t *testing.T) {
	// Boolean-mode re-tokenization of a Chinese keyword must go through jieba.
	accum, err := NewSearchAccum("src", "idx", "+清华大学",
		int64(tree.FULLTEXT_BOOLEAN), `{"parser":"gojieba"}`, ALGO_TFIDF)
	require.Nil(t, err)

	sql, err := PatternToSql(accum.Pattern, accum.Mode, accum.TblName, "gojieba", ALGO_TFIDF)
	require.Nil(t, err)

	// Jieba keeps "清华大学" as one dictionary word, so the generated SQL
	// must look up that exact word — not 3-char overlapping bigrams.
	assert.Contains(t, sql, "word = '清华大学'")
	assert.False(t, strings.Contains(sql, "word = '清华大'"),
		"sql should not contain ngram bigram 清华大: %s", sql)
}
2 changes: 1 addition & 1 deletion pkg/fulltext/sql.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ func GenTextSql(p *Pattern, mode int64, idxtbl string, parser string) (string, e
return sql, nil
}

ps, err := ParsePatternInNLMode(p.Text)
ps, err := ParsePatternInNLMode(p.Text, parser)
if err != nil {
return "", err
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/fulltext/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func PatternListToString(ps []*Pattern) string {
}

func PatternToString(pattern string, mode int64) (string, error) {
ps, err := ParsePattern(pattern, mode)
ps, err := ParsePattern(pattern, mode, "")
if err != nil {
return "", err
}
Expand All @@ -200,7 +200,7 @@ func PatternListToStringWithPosition(ps []*Pattern) string {
}

func PatternToStringWithPosition(pattern string, mode int64) (string, error) {
ps, err := ParsePattern(pattern, mode)
ps, err := ParsePattern(pattern, mode, "")
if err != nil {
return "", err
}
Expand Down
20 changes: 20 additions & 0 deletions pkg/monlp/tokenizer/dict/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
The MIT License (MIT)

Copyright (c) 2013

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 changes: 31 additions & 0 deletions pkg/monlp/tokenizer/dict/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CppJieba字典

文件后缀名代表的是词典的编码方式。
比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。


## 分词

### jieba.dict.utf8/gbk

作为最大概率法(MPSegment: Max Probability)分词所使用的词典。

### hmm_model.utf8/gbk

作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。

__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__


## 关键词抽取

### idf.utf8

IDF(Inverse Document Frequency)
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。

### stop_words.utf8

停用词词典


34 changes: 34 additions & 0 deletions pkg/monlp/tokenizer/dict/hmm_model.utf8

Large diffs are not rendered by default.

Loading
Loading