1
0
Fork 0
mirror of https://github.com/anyproto/anytype-heart.git synced 2025-06-11 02:13:41 +09:00

Merge pull request #1373 from anytypeio/fix-ftsearch

ft search fixes
This commit is contained in:
Sergey Cherepanov 2022-02-17 18:11:34 +03:00 committed by GitHub
commit fef51f1aa5
Signed by: github
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 143 additions and 131 deletions

View file

@ -39,7 +39,7 @@ const (
CName = "indexer"
// increasing the counters below will trigger existing accounts to reindex their data
ForceThreadsObjectsReindexCounter int32 = 5 // reindex thread-based objects
ForceThreadsObjectsReindexCounter int32 = 6 // reindex thread-based objects
ForceFilesReindexCounter int32 = 5 // reindex ipfs-file-based objects
ForceBundledObjectsReindexCounter int32 = 3 // reindex objects like anytypeProfile
ForceIdxRebuildCounter int32 = 12 // erases localstore indexes and reindex all type of objects (no need to increase ForceThreadsObjectsReindexCounter & ForceFilesReindexCounter)
@ -49,7 +49,8 @@ const (
var log = logging.Logger("anytype-doc-indexer")
var (
ftIndexInterval = 10 * time.Second
ftIndexInterval = time.Minute
ftIndexForceMinInterval = time.Second * 10
)
func New() Indexer {
@ -57,6 +58,7 @@ func New() Indexer {
}
// Indexer is the public interface of the indexer component.
type Indexer interface {
// ForceFTIndex requests a full-text indexing pass outside of the regular
// ticker schedule. In the implementation in this file the request is
// best-effort: it is sent without blocking, and the loop only honours it
// when enough time has passed since the previous forced pass.
ForceFTIndex()
app.ComponentRunnable
}
@ -96,6 +98,7 @@ type indexer struct {
archivedMap map[string]struct{}
favoriteMap map[string]struct{}
newAccount bool
forceFt chan struct{}
}
func (i *indexer) Init(a *app.App) (err error) {
@ -112,7 +115,7 @@ func (i *indexer) Init(a *app.App) (err error) {
i.quit = make(chan struct{})
i.archivedMap = make(map[string]struct{}, 100)
i.favoriteMap = make(map[string]struct{}, 100)
i.forceFt = make(chan struct{})
return
}
@ -165,6 +168,13 @@ func (i *indexer) Run() (err error) {
return
}
// ForceFTIndex asks the full-text indexing loop to run an indexing pass.
//
// The request is handed over with a non-blocking send on forceFt: if no
// receiver is ready to take it at that moment, the request is dropped rather
// than making the caller wait.
func (i *indexer) ForceFTIndex() {
	var request struct{}
	select {
	case i.forceFt <- request:
		// Request accepted by the receiver.
	default:
		// Nobody is ready to receive right now; skip this request.
	}
}
func (i *indexer) migrateRemoveNonindexableObjects() {
ids, err := i.getIdsForTypes(
smartblock.SmartblockTypeMarketplaceType, smartblock.SmartblockTypeMarketplaceRelation,
@ -728,7 +738,7 @@ func (i *indexer) index(ctx context.Context, info doc.DocInfo) error {
func (i *indexer) ftLoop() {
ticker := time.NewTicker(ftIndexInterval)
i.ftIndex()
var lastForceIndex time.Time
i.mu.Lock()
quit := i.quit
i.mu.Unlock()
@ -738,6 +748,11 @@ func (i *indexer) ftLoop() {
return
case <-ticker.C:
i.ftIndex()
case <-i.forceFt:
if time.Since(lastForceIndex) > ftIndexForceMinInterval {
i.ftIndex()
lastForceIndex = time.Now()
}
}
}
}

View file

@ -2,6 +2,7 @@ package core
import (
"fmt"
"github.com/anytypeio/go-anytype-middleware/core/indexer"
"strings"
"time"
@ -110,7 +111,9 @@ func (mw *Middleware) ObjectSearch(req *pb.RpcObjectSearchRequest) *pb.RpcObject
}
at := mw.app.MustComponent(core.CName).(core.Service)
if req.FullText != "" {
mw.app.MustComponent(indexer.CName).(indexer.Indexer).ForceFTIndex()
}
records, _, err := at.ObjectStore().Query(nil, database.Query{
Filters: req.Filters,
Sorts: req.Sorts,

1
go.mod
View file

@ -11,6 +11,7 @@ require (
github.com/anytypeio/go-slip21 v0.0.0-20200218204727-e2e51e20ab51
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de
github.com/blevesearch/bleve/v2 v2.3.0 // indirect
github.com/blevesearch/bleve_index_api v1.0.1 // indirect
github.com/cheggaaa/mb v1.0.3
github.com/dave/jennifer v1.4.1
github.com/dgraph-io/badger v1.6.2

View file

@ -1,26 +1,34 @@
package ftsearch
import (
"github.com/anytypeio/go-anytype-middleware/app"
"github.com/anytypeio/go-anytype-middleware/core/wallet"
"github.com/anytypeio/go-anytype-middleware/metrics"
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
"os"
"path/filepath"
"strings"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/analyzer/standard"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/anytypeio/go-anytype-middleware/app"
"github.com/anytypeio/go-anytype-middleware/core/wallet"
"github.com/anytypeio/go-anytype-middleware/metrics"
)
const (
// CName identifies the full-text search component.
// NOTE(review): presumably the value returned by Name() and used for app
// component lookup — confirm against the rest of the file.
CName = "fts"
// ftsDir is the directory under the wallet repo path that holds index data.
ftsDir = "fts"
// ftsVer is the sub-directory of ftsDir used for the current index. When a
// new index is created, Run removes every other entry found in ftsDir.
ftsVer = "1"
)
type SearchDoc struct {
Id string
Title string
Text string
Id string
Title string
TitleNoTerms string
Text string
}
func New() FTSearch {
@ -37,13 +45,17 @@ type FTSearch interface {
}
type ftSearch struct {
path string
index bleve.Index
rootPath string
ftsPath string
index bleve.Index
enStopWordsMap map[string]bool
}
func (f *ftSearch) Init(a *app.App) (err error) {
repoPath := a.MustComponent(wallet.CName).(wallet.Wallet).RepoPath()
f.path = filepath.Join(repoPath, ftsDir)
f.rootPath = filepath.Join(repoPath, ftsDir)
f.ftsPath = filepath.Join(repoPath, ftsDir, ftsVer)
f.enStopWordsMap, _ = en.TokenMapConstructor(nil, nil)
return nil
}
@ -52,60 +64,131 @@ func (f *ftSearch) Name() (name string) {
}
func (f *ftSearch) Run() (err error) {
f.index, err = bleve.Open(f.path)
f.index, err = bleve.Open(f.ftsPath)
if err == bleve.ErrorIndexPathDoesNotExist || err == bleve.ErrorIndexMetaMissing {
mapping := bleve.NewIndexMapping()
if f.index, err = bleve.New(f.path, mapping); err != nil {
if f.index, err = bleve.New(f.ftsPath, f.makeMapping()); err != nil {
return
}
// cleanup old indexes
if strings.HasSuffix(f.rootPath, ftsDir) {
de, e := os.ReadDir(f.rootPath)
if e == nil {
// cleanup old index versions
for _, d := range de {
if d.Name() != ftsVer {
os.RemoveAll(filepath.Join(f.rootPath, d.Name()))
}
}
}
}
} else if err != nil {
return
}
return nil
}
// makeMapping builds the bleve index mapping passed to bleve.New when a fresh
// index is created.
//
// Fields registered on the default document mapping:
//   - "TitleNoTerms" and "Id" share one text field mapping that uses the custom
//     "noTerms" analyzer (the "single" tokenizer plus a lowercase filter);
//   - "Title" and "Text" share one text field mapping that uses bleve's
//     standard analyzer.
func (f *ftSearch) makeMapping() mapping.IndexMapping {
	const noTermsAnalyzer = "noTerms"

	indexMapping := bleve.NewIndexMapping()
	defaultDoc := indexMapping.DefaultMapping

	wholeValueField := bleve.NewTextFieldMapping()
	wholeValueField.Analyzer = noTermsAnalyzer
	for _, field := range []string{"TitleNoTerms", "Id"} {
		defaultDoc.AddFieldMappingsAt(field, wholeValueField)
	}

	analyzedField := bleve.NewTextFieldMapping()
	analyzedField.Analyzer = standard.Name
	for _, field := range []string{"Title", "Text"} {
		defaultDoc.AddFieldMappingsAt(field, analyzedField)
	}

	analyzerConfig := map[string]interface{}{
		"type":          custom.Name,
		"tokenizer":     single.Name,
		"token_filters": []string{lowercase.Name},
	}
	// NOTE(review): whatever AddCustomAnalyzer returns is not inspected here,
	// so a rejected analyzer definition would go unnoticed.
	indexMapping.AddCustomAnalyzer(noTermsAnalyzer, analyzerConfig)

	return indexMapping
}
// Index adds or replaces the document in the full-text index under its Id.
//
// The update counter metric is incremented on every call. TitleNoTerms is
// always overwritten with Title before indexing, so any value the caller set
// for it is ignored. The caller's struct is not modified: d is passed by value.
func (f *ftSearch) Index(d SearchDoc) (err error) {
	metrics.ObjectFTUpdatedCounter.Inc()

	doc := d
	doc.TitleNoTerms = doc.Title

	err = f.index.Index(doc.Id, doc)
	return err
}
func (f *ftSearch) Search(text string) (results []string, err error) {
text = strings.ToLower(strings.TrimSpace(text))
var queries = make([]query.Query, 0, 4)
terms := append([]string{text}, strings.Split(text, " ")...)
termsFiltered := terms[:0]
// id match
if len(text) > 10 {
im := bleve.NewMatchQuery(text)
im.SetField("Id")
im.SetBoost(30)
queries = append(queries, im)
for _, t := range terms {
t = strings.TrimSpace(t)
if t != "" && !f.enStopWordsMap[t] {
termsFiltered = append(termsFiltered, t)
}
}
terms = termsFiltered
var exactQueries = make([]query.Query, 0, 4)
// id match
if len(text) > 5 {
im := bleve.NewDocIDQuery([]string{text})
im.SetBoost(30)
exactQueries = append(exactQueries, im)
}
// title prefix
tp := bleve.NewPrefixQuery(text)
tp.SetField("Title")
tp.SetField("TitleNoTerms")
tp.SetBoost(40)
queries = append(queries, tp)
exactQueries = append(exactQueries, tp)
// title substr
tss := bleve.NewWildcardQuery("*" + strings.ReplaceAll(text, "*", `\*`) + "*")
tss.SetField("Title")
tss.SetField("TitleNoTerms")
tss.SetBoost(8)
queries = append(queries, tss)
// title match
tm := bleve.NewMatchQuery(text)
tm.SetFuzziness(1)
tm.SetField("Title")
tm.SetBoost(7)
queries = append(queries, tm)
// text match
txtm := bleve.NewMatchQuery(text)
txtm.SetFuzziness(0)
txtm.SetField("Text")
queries = append(queries, txtm)
exactQueries = append(exactQueries, tss)
sr := bleve.NewSearchRequest(bleve.NewDisjunctionQuery(queries...))
var notExactQueriesGroup = make([]query.Query, 0, 5)
for i, t := range terms {
// fulltext queries
var notExactQueries = make([]query.Query, 0, 3)
tp = bleve.NewPrefixQuery(t)
tp.SetField("Title")
if i == 0 {
tp.SetBoost(8)
}
notExactQueries = append(notExactQueries, tp)
// title match
tm := bleve.NewMatchQuery(t)
tm.SetFuzziness(1)
tm.SetField("Title")
if i == 0 {
tm.SetBoost(7)
}
notExactQueries = append(notExactQueries, tm)
// text match
txtm := bleve.NewMatchQuery(t)
txtm.SetFuzziness(0)
txtm.SetField("Text")
if i == 0 {
txtm.SetBoost(2)
}
notExactQueries = append(notExactQueries, txtm)
notExactQueriesGroup = append(notExactQueriesGroup, bleve.NewDisjunctionQuery(notExactQueries...))
}
//exactQueries = []query.Query{bleve.NewDisjunctionQuery(notExactQueriesGroup...)}
exactQueries = append(exactQueries, bleve.NewConjunctionQuery(notExactQueriesGroup...))
sr := bleve.NewSearchRequest(bleve.NewDisjunctionQuery(exactQueries...))
sr.Size = 100
sr.Explain = true
res, err := f.index.Search(sr)
//fmt.Println(res.String())
if err != nil {
return
}

View file

@ -52,93 +52,3 @@ func TestNewFTSearch(t *testing.T) {
assert.Len(t, res, 1)
ft.Close()
}
// TestFtSearch_Search indexes a small fixed corpus and verifies that Search
// returns the expected document ids, in the expected order, for each query.
func TestFtSearch_Search(t *testing.T) {
	// Fail right away if the temp dir cannot be created, instead of letting the
	// fixture run against an empty path.
	tmpDir, err := ioutil.TempDir("", "")
	require.NoError(t, err)

	fixture := newFixture(tmpDir, t)
	ft := fixture.ft
	defer ft.Close()

	var docs = [...]SearchDoc{
		{
			Id:    "1",
			Title: "First one",
			Text:  "one two two",
		},
		{
			Id:    "2",
			Title: "Second two",
			Text:  "one two three",
		},
		{
			Id:    "3",
			Title: "Third three",
			Text:  "some text with 3",
		},
		{
			Id:    "4",
			Title: "Fours four",
			Text:  "some text with four and some text five",
		},
		{
			Id:    "5",
			Title: "Fives five",
			Text:  "some text with five and one and two ans rs",
		},
		{
			Id:    "6",
			Title: "Rs six some",
			Text:  "some text with six",
		},
		{
			// Document with an id only: exercises lookup by id.
			Id: "somelongidentifier",
		},
		{
			// Mixed-case title: exercises case-insensitive prefix matching.
			Id:    "eczq5t",
			Title: "FERRARI styling CENter with somethinglong ",
		},
	}
	for _, d := range docs {
		require.NoError(t, ft.Index(d))
	}

	// Each case lists the ids expected from Search, in result order.
	searches := [...]struct {
		Query  string
		Result []string
	}{
		{
			"one",
			[]string{"1", "2", "5"},
		},
		{
			"rs",
			[]string{"6", "1", "4", "5"},
		},
		{
			"two",
			[]string{"2", "1", "5"},
		},
		{
			"six",
			[]string{"6"},
		},
		{
			"some text",
			[]string{"4", "3", "6", "5"},
		},
		{
			"somelongidentifier",
			[]string{"somelongidentifier"},
		},
		{
			"FeRRa",
			[]string{"eczq5t"},
		},
	}
	for _, st := range searches {
		ids, err := ft.Search(st.Query)
		require.NoError(t, err)
		assert.Equal(t, st.Result, ids, st.Query)
		t.Logf("%s:\t%v\t%v", st.Query, ids, st.Result)
	}
}