1
0
Fork 0
mirror of https://github.com/anyproto/anytype-heart.git synced 2025-06-08 05:47:07 +09:00

GO-4472 Improve prefix search (#1889)

This commit is contained in:
Mikhail 2024-12-06 15:54:29 +01:00 committed by GitHub
parent 8609dca7ff
commit 8f635d0627
Signed by: github
GPG key ID: B5690EEEBB952194
6 changed files with 142 additions and 141 deletions

2
go.mod
View file

@ -13,7 +13,7 @@ require (
github.com/anyproto/go-naturaldate/v2 v2.0.2-0.20230524105841-9829cfd13438
github.com/anyproto/lexid v0.0.4
github.com/anyproto/protobuf v1.3.3-0.20240814124528-72b8c7e0e0f5
github.com/anyproto/tantivy-go v0.2.0
github.com/anyproto/tantivy-go v0.3.0
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de
github.com/avast/retry-go/v4 v4.6.0
github.com/chai2010/webp v1.1.2-0.20240612091223-aa1b379218b7

4
go.sum
View file

@ -106,8 +106,8 @@ github.com/anyproto/protobuf v1.3.3-0.20240814124528-72b8c7e0e0f5 h1:aY7tBzQ+z8H
github.com/anyproto/protobuf v1.3.3-0.20240814124528-72b8c7e0e0f5/go.mod h1:5+PHE01DgsDPkralb8MYmGg2sPQahsqEJ9ue7ciDHKg=
github.com/anyproto/ristretto v0.1.2-0.20240221153107-2b23839cc50c h1:GicoaTUyB2mtCIl3YMrO0OzysqRT5GA4vuvDsqEkhSM=
github.com/anyproto/ristretto v0.1.2-0.20240221153107-2b23839cc50c/go.mod h1:S1GPSBCYCIhmVNfcth17y2zZtQT6wzkzgwUve0VDWWA=
github.com/anyproto/tantivy-go v0.2.0 h1:+b778sOPy07KpJULL4ztCc106L+zdmwKCIB5UYnWrZo=
github.com/anyproto/tantivy-go v0.2.0/go.mod h1:MMLYW7e5SIzsHS3Q5CYiF1J7kJJaIRT+VVHGArU24IQ=
github.com/anyproto/tantivy-go v0.3.0 h1:VHZ8+EnlndFbjs8pBoqvtYk+zq8l/QXkZwugkDwN48g=
github.com/anyproto/tantivy-go v0.3.0/go.mod h1:7hhkPpyTq7+W1dx9Dcva4bsg4TLHq9xqmmYLCwqDq/k=
github.com/anyproto/zeroconf/v2 v2.2.1-0.20240228113933-f90a5cc4439d h1:5bj7nX/AS8sxGpTIrapE7PC4oPlhkHMwMqXlJbUHBlg=
github.com/anyproto/zeroconf/v2 v2.2.1-0.20240228113933-f90a5cc4439d/go.mod h1:fuJqLnUwZTshS3U/bMRJ3+ow/v9oid1n0DmyYyNO1Xs=
github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=

View file

@ -21,10 +21,10 @@ import (
"strings"
"sync"
"time"
"unicode"
"github.com/anyproto/any-sync/app"
tantivy "github.com/anyproto/tantivy-go"
"github.com/samber/lo"
"github.com/valyala/fastjson"
"github.com/anyproto/anytype-heart/core/wallet"
@ -37,7 +37,7 @@ const (
CName = "fts"
ftsDir = "fts"
ftsDir2 = "fts_tantivy"
ftsVer = "9"
ftsVer = "10"
docLimit = 10000
fieldTitle = "Title"
@ -62,7 +62,7 @@ type FTSearch interface {
NewAutoBatcher() AutoBatcher
BatchIndex(ctx context.Context, docs []SearchDoc, deletedDocs []string) (err error)
BatchDeleteObjects(ids []string) (err error)
Search(spaceIds []string, query string) (results []*DocumentMatch, err error)
Search(spaceIds string, query string) (results []*DocumentMatch, err error)
Iterate(objectId string, fields []string, shouldContinue func(doc *SearchDoc) bool) (err error)
DeleteObject(id string) error
DocCount() (uint64, error)
@ -169,7 +169,7 @@ func (f *ftSearchTantivy) Run(context.Context) error {
}
err = builder.AddTextField(
fieldId,
fieldId, // 0
true,
true,
false,
@ -178,7 +178,7 @@ func (f *ftSearchTantivy) Run(context.Context) error {
)
err = builder.AddTextField(
fieldIdRaw,
fieldIdRaw, // 1
true,
true,
true,
@ -187,7 +187,7 @@ func (f *ftSearchTantivy) Run(context.Context) error {
)
err = builder.AddTextField(
fieldSpace,
fieldSpace, // 2
true,
false,
true,
@ -196,25 +196,7 @@ func (f *ftSearchTantivy) Run(context.Context) error {
)
err = builder.AddTextField(
fieldTitle,
true,
true,
false,
tantivy.IndexRecordOptionWithFreqsAndPositions,
tantivy.TokenizerNgram,
)
err = builder.AddTextField(
fieldTitleZh,
true,
true,
false,
tantivy.IndexRecordOptionWithFreqsAndPositions,
tantivy.TokenizerJieba,
)
err = builder.AddTextField(
fieldText,
fieldTitle, // 3
true,
true,
false,
@ -223,7 +205,25 @@ func (f *ftSearchTantivy) Run(context.Context) error {
)
err = builder.AddTextField(
fieldTextZh,
fieldTitleZh, // 4
true,
true,
false,
tantivy.IndexRecordOptionWithFreqsAndPositions,
tantivy.TokenizerJieba,
)
err = builder.AddTextField(
fieldText, // 5
true,
true,
false,
tantivy.IndexRecordOptionWithFreqsAndPositions,
tantivy.TokenizerSimple,
)
err = builder.AddTextField(
fieldTextZh, // 6
true,
true,
false,
@ -366,28 +366,52 @@ func (f *ftSearchTantivy) BatchIndex(ctx context.Context, docs []SearchDoc, dele
return f.index.AddAndConsumeDocuments(tantivyDocs...)
}
func (f *ftSearchTantivy) Search(spaceIds []string, query string) (results []*DocumentMatch, err error) {
spaceIdsQuery := getSpaceIdsQuery(spaceIds)
func (f *ftSearchTantivy) Search(spaceId string, query string) (results []*DocumentMatch, err error) {
query = prepareQuery(query)
if query == "" {
return nil, nil
}
if spaceIdsQuery != "" {
query = fmt.Sprintf("%s AND %s", spaceIdsQuery, query)
qb := tantivy.NewQueryBuilder()
if len(spaceId) != 0 {
qb.Query(tantivy.Must, fieldSpace, spaceId, tantivy.TermQuery, 1.0)
}
if containsChineseCharacters(query) {
qb.BooleanQuery(tantivy.Must, qb.NestedBuilder().
Query(tantivy.Should, fieldTitleZh, query, tantivy.PhrasePrefixQuery, 5.0).
Query(tantivy.Should, fieldTitleZh, query, tantivy.PhraseQuery, 5.0).
Query(tantivy.Should, fieldTitleZh, query, tantivy.EveryTermQuery, 0.75).
Query(tantivy.Should, fieldTitleZh, query, tantivy.OneOfTermQuery, 0.5).
Query(tantivy.Should, fieldTextZh, query, tantivy.PhrasePrefixQuery, 1.0).
Query(tantivy.Should, fieldTextZh, query, tantivy.PhraseQuery, 1.0).
Query(tantivy.Should, fieldTextZh, query, tantivy.EveryTermQuery, 0.5).
Query(tantivy.Should, fieldTextZh, query, tantivy.OneOfTermQuery, 0.25),
1.0,
)
} else {
qb.BooleanQuery(tantivy.Must, qb.NestedBuilder().
Query(tantivy.Should, fieldTitle, query, tantivy.PhrasePrefixQuery, 10.0).
Query(tantivy.Should, fieldTitle, query, tantivy.PhraseQuery, 10.0).
Query(tantivy.Should, fieldTitle, query, tantivy.EveryTermQuery, 0.75).
Query(tantivy.Should, fieldTitle, query, tantivy.OneOfTermQuery, 0.5).
Query(tantivy.Should, fieldText, query, tantivy.PhrasePrefixQuery, 1.0).
Query(tantivy.Should, fieldText, query, tantivy.PhraseQuery, 1.0).
Query(tantivy.Should, fieldText, query, tantivy.EveryTermQuery, 0.5).
Query(tantivy.Should, fieldText, query, tantivy.OneOfTermQuery, 0.25),
1.0,
)
}
finalQuery := qb.Build()
sCtx := tantivy.NewSearchContextBuilder().
SetQuery(query).
SetQueryFromJson(&finalQuery).
SetDocsLimit(100).
SetWithHighlights(true).
AddFieldDefaultWeight(fieldId).
AddFieldDefaultWeight(fieldSpace).
AddField(fieldTitle, 10.0).
AddField(fieldTitleZh, 10.0).
AddFieldDefaultWeight(fieldText).
AddFieldDefaultWeight(fieldTextZh).
Build()
result, err := f.index.Search(sCtx)
result, err := f.index.SearchJson(sCtx)
if err != nil {
return nil, wrapError(err)
}
@ -432,6 +456,15 @@ func (f *ftSearchTantivy) Search(spaceIds []string, query string) (results []*Do
)
}
// containsChineseCharacters reports whether s holds at least one rune that
// belongs to the Unicode Han script table. It returns false for an empty
// string.
func containsChineseCharacters(s string) bool {
	for _, symbol := range s {
		// Skip everything outside the Han range; the first hit decides.
		if !unicode.Is(unicode.Han, symbol) {
			continue
		}
		return true
	}
	return false
}
func extractHighlight(object *fastjson.Object, fragments map[string]*Highlight, fieldName string) {
highlightObj := object.Get(fragment)
if highlightObj == nil {
@ -459,26 +492,6 @@ func wrapError(err error) error {
return err
}
// getSpaceIdsQuery builds a parenthesised clause that matches any of the
// given space ids, in the form "(<fieldSpace>:id1 OR <fieldSpace>:id2)".
// Empty ids are ignored; when no non-empty id remains, the result is the
// empty string.
func getSpaceIdsQuery(ids []string) string {
	nonEmpty := lo.Filter(ids, func(spaceId string, _ int) bool { return spaceId != "" })
	if len(nonEmpty) == 0 {
		return ""
	}

	clauses := make([]string, 0, len(nonEmpty))
	for _, spaceId := range nonEmpty {
		clauses = append(clauses, fieldSpace+":"+spaceId)
	}
	return "(" + strings.Join(clauses, " OR ") + ")"
}
func (f *ftSearchTantivy) Delete(id string) error {
return f.BatchDeleteObjects([]string{id})
}
@ -505,17 +518,5 @@ func prepareQuery(query string) string {
query = text.Truncate(query, 100, "")
query = strings.ToLower(query)
query = strings.TrimSpace(query)
var escapedQuery strings.Builder
for _, char := range query {
if _, found := specialChars[char]; !found {
escapedQuery.WriteRune(char)
}
}
resultQuery := escapedQuery.String()
if resultQuery == "" {
return resultQuery
}
return "(\"" + resultQuery + "\" OR " + resultQuery + ")"
return query
}

View file

@ -4,7 +4,6 @@ import (
"context"
"fmt"
"os"
"strings"
"testing"
"github.com/anyproto/any-sync/app"
@ -76,23 +75,15 @@ func TestDifferentSpaces(t *testing.T) {
SpaceId: "space2",
}))
search, err := ft.Search([]string{"space1"}, "one")
search, err := ft.Search("space1", "one")
require.NoError(t, err)
require.Len(t, search, 1)
search, err = ft.Search([]string{"space2"}, "one")
search, err = ft.Search("space2", "one")
require.NoError(t, err)
require.Len(t, search, 1)
search, err = ft.Search([]string{"space1", "space2"}, "one")
require.NoError(t, err)
require.Len(t, search, 2)
search, err = ft.Search([]string{""}, "one")
require.NoError(t, err)
require.Len(t, search, 2)
search, err = ft.Search(nil, "one")
search, err = ft.Search("", "one")
require.NoError(t, err)
require.Len(t, search, 2)
@ -115,7 +106,12 @@ func TestNewFTSearch(t *testing.T) {
{
name: "assertFoundCaseSensitivePartsOfTheWords",
tester: assertFoundCaseSensitivePartsOfTheWords,
}, {
},
{
name: "assertPrefix",
tester: assertPrefix,
},
{
name: "assertChineseFound",
tester: assertChineseFound,
},
@ -133,6 +129,50 @@ func TestNewFTSearch(t *testing.T) {
}
}
// assertPrefix indexes four documents and verifies that whole-word, prefix,
// and multi-word queries each return exactly one match, both for a document
// whose content is in Title and for one whose content is in Text.
//
// Every document gets a distinct Id so that the two filler documents are
// independent entries in the index.
func assertPrefix(t *testing.T, tmpDir string) {
	fixture := newFixture(tmpDir, t)
	ft := fixture.ft

	// Document matched through its title.
	require.NoError(t, ft.Index(SearchDoc{
		Id:    "1",
		Title: "I love my mum",
		Text:  "",
	}))
	// Document matched through its text.
	require.NoError(t, ft.Index(SearchDoc{
		Id:    "2",
		Title: "",
		Text:  "Something completely different",
	}))
	// Filler documents that none of the queries below should match.
	require.NoError(t, ft.Index(SearchDoc{
		Id:    "3",
		Title: "Just random filler",
		Text:  "",
	}))
	require.NoError(t, ft.Index(SearchDoc{
		Id:    "4",
		Title: "Another text for fun",
		Text:  "",
	}))

	// Queries against the title of document 1.
	validateSearch(t, ft, "", "I love", 1)
	validateSearch(t, ft, "", "I lo", 1)
	validateSearch(t, ft, "", "I", 1)
	validateSearch(t, ft, "", "lov", 1)

	// Queries against the text of document 2, including reversed word order.
	validateSearch(t, ft, "", "Something", 1)
	validateSearch(t, ft, "", "Some", 1)
	validateSearch(t, ft, "", "comp", 1)
	validateSearch(t, ft, "", "diff", 1)
	validateSearch(t, ft, "", "Something c", 1)
	validateSearch(t, ft, "", "Something different", 1)
	validateSearch(t, ft, "", "different something", 1)

	// The close error is not relevant to what this helper asserts.
	_ = ft.Close(nil)
}
func assertFoundCaseSensitivePartsOfTheWords(t *testing.T, tmpDir string) {
fixture := newFixture(tmpDir, t)
ft := fixture.ft
@ -252,7 +292,7 @@ func assertSearch(t *testing.T, tmpDir string) {
}
func validateSearch(t *testing.T, ft FTSearch, spaceID, qry string, times int) {
res, err := ft.Search([]string{spaceID}, qry)
res, err := ft.Search(spaceID, qry)
require.NoError(t, err)
assert.Len(t, res, times)
}
@ -292,51 +332,11 @@ func assertMultiSpace(t *testing.T, tmpDir string) {
validateSearch(t, ft, "", "Advanced", 1)
validateSearch(t, ft, "", "dash", 2)
validateSearch(t, ft, "", "space", 4)
validateSearch(t, ft, "", "of", 5)
validateSearch(t, ft, "", "of", 4)
_ = ft.Close(nil)
}
// TestEscapeQuery runs prepareQuery over a table of raw inputs and compares
// each result with the exact query string expected for it.
func TestEscapeQuery(t *testing.T) {
	longWord := strings.Repeat("a", 99)

	cases := []struct {
		input    string
		expected string
	}{
		{longWord + " aa", `("` + longWord + `" OR ` + longWord + `)`},
		{`""`, ``},
		{"simpleQuery", `("simplequery" OR simplequery)`},
		{"with+special^chars", `("withspecialchars" OR withspecialchars)`},
		{"text`with:brackets{}", `("textwithbrackets" OR textwithbrackets)`},
		{"escaped[]symbols()", `("escapedsymbols" OR escapedsymbols)`},
		{"multiple!!special~~", `("multiplespecial" OR multiplespecial)`},
	}

	for i := range cases {
		tc := cases[i]
		if got := prepareQuery(tc.input); got != tc.expected {
			t.Errorf("For input '%s', expected '%s', but got '%s'", tc.input, tc.expected, got)
		}
	}
}
// Tests
// TestGetSpaceIdsQuery checks the clause returned by getSpaceIdsQuery for
// empty, all-blank, single, multiple, and partially blank id lists.
func TestGetSpaceIdsQuery(t *testing.T) {
	cases := []struct {
		ids  []string
		want string
	}{
		// Empty slice of ids.
		{[]string{}, ""},
		// Slice containing only empty strings.
		{[]string{"", "", ""}, ""},
		// A single id.
		{[]string{"123"}, "(SpaceID:123)"},
		// Multiple ids.
		{[]string{"123", "456", "789"}, "(SpaceID:123 OR SpaceID:456 OR SpaceID:789)"},
		// Some empty ids mixed in.
		{[]string{"123", "", "789"}, "(SpaceID:123 OR SpaceID:789)"},
	}

	for _, tc := range cases {
		assert.Equal(t, tc.want, getSpaceIdsQuery(tc.ids))
	}
}
func TestFtSearch_Close(t *testing.T) {
// given
fts := new(ftSearchTantivy)

View file

@ -281,7 +281,7 @@ func (s *dsObjectStore) performQuery(q database.Query) (records []database.Recor
}
func (s *dsObjectStore) performFulltextSearch(text string, spaceId string) ([]database.FulltextResult, error) {
ftsResults, err := s.fts.Search([]string{spaceId}, text)
ftsResults, err := s.fts.Search(spaceId, text)
if err != nil {
return nil, fmt.Errorf("fullText search: %w", err)
}

View file

@ -1,11 +1,11 @@
b9ce3724a4b77c7d08d74843c08c0961ccbff8d446c6186ce0e7fd484bb1cc04 deps/libs/android-386.tar.gz
902dc8730451ad3e0e21d1d59cea3a833bb98bb7dc944b4263e5dbe955835101 deps/libs/android-amd64.tar.gz
9cd93ffd7ec2ab96ef0c3720ef30a1c1f290b785b0764cc9aae1f4eceaf128ca deps/libs/android-arm.tar.gz
220b5876c642080f39e23a0033e4b853d749c38b02b19c8b605571c53f05f833 deps/libs/android-arm64.tar.gz
d691873e86e6677e7df748982f274a0bfea92e6ac6e558c5c5d542263944436b deps/libs/darwin-amd64.tar.gz
83f4360fd53777805ba349924ade7fb1ebce00818bc2e37da87ccf7f8c53f024 deps/libs/darwin-arm64.tar.gz
a22f251a7b12152503d6b175e57c47f9117cccd0e3182e4febedda6615fcbd36 deps/libs/ios-amd64.tar.gz
1875f72dd8d87d833639d1ffef63d6a9e3a6175e442f1cf2954973b81f676125 deps/libs/ios-arm64.tar.gz
e873c9439307e5bacbe48a03bbc112a5550b5d907567950bd9cc33728d4f8cd3 deps/libs/ios-arm64-sim.tar.gz
63bf26424e423230d7f4a267d103ff91db8b5e9ad24916b104190f0ae006d480 deps/libs/linux-amd64-musl.tar.gz
d4269f6bc1fd70e8e15b4f396d9bb1ba046835ba75731d12071a9d079ec0ce4d deps/libs/windows-amd64.tar.gz
8bb5b1d7dc93a95f3b29450531304581ab9201e89238986d69a55df6afa54f65 deps/libs/android-386.tar.gz
0aa7d66c25ce31af1831508f4c1555c5632a6bcae57d5218ddbaf9215ee9c056 deps/libs/android-amd64.tar.gz
7a5f486924256e7ad86f76c85c003f02e181ab46aeb7b25a1c9ab6495b5cdee5 deps/libs/android-arm.tar.gz
266994bd53b14c571a685c9a2a93dca3c131b14d26b0b5493f359093bdb43388 deps/libs/android-arm64.tar.gz
267738043946821174e6682fb887f46fdc8ba2043c23e2b6552242380e945f7b deps/libs/darwin-amd64.tar.gz
40b09e38f1ec50b296df288b2f208f871a5f7420a8c1e797ba184d4303c2b4f2 deps/libs/darwin-arm64.tar.gz
2383e45089229d2deec53fc77bad522d6268581856064ce92980905f3a8ad71c deps/libs/ios-amd64.tar.gz
4e75cfe16e64e86cedf004dc2138c0bc6d6d8465a212ac245b777de666375c3f deps/libs/ios-arm64.tar.gz
aa43f85be9d6b9f8ee791775960814b3a1d9ffc313a6fd0bdad06e67f9b7cf87 deps/libs/ios-arm64-sim.tar.gz
94fecdb1703f7b89dab10b4bcdf802bd210cbaaf0acd53d09a17206f480037fa deps/libs/linux-amd64-musl.tar.gz
6316199ff84a23e4509e81af8e937534318338cb8c50d3726769af179abd0b1a deps/libs/windows-amd64.tar.gz