1
0
Fork 0
mirror of https://github.com/anyproto/anytype-heart.git synced 2025-06-09 17:44:59 +09:00

Merge pull request #1917 from anyproto/go-4573-importing-markdown-files-named-in-japanese-or-chinese

GO-4573: Importing markdown files named in Japanese or Chinese, references to these files won't resolve properly
This commit is contained in:
Anastasia Shemyakinskaya 2024-12-09 10:46:39 +01:00 committed by GitHub
commit ec00669550
Signed by: github
GPG key ID: B5690EEEBB952194
12 changed files with 215 additions and 38 deletions

View file

@ -12,13 +12,18 @@ import (
"github.com/anyproto/anytype-heart/util/anyerror"
)
// OriginalFileNameGetter is an optional capability of an import source: it
// translates one file name into another via GetFileOriginalName. The markdown
// converter type-asserts its source to this interface and uses the result as
// the key when resolving link targets.
//
// NOTE(review): despite the method name, the Zip implementation visible in this
// change looks the argument up in a map keyed by the archive entry's raw name
// and returns the normalized name — confirm the intended direction and naming.
type OriginalFileNameGetter interface {
// GetFileOriginalName returns the name associated with filename by the
// implementation; the Zip implementation returns filename unchanged when it
// has no mapping for it.
GetFileOriginalName(filename string) string
}
type Zip struct {
archiveReader *zip.ReadCloser
fileReaders map[string]*zip.File
archiveReader *zip.ReadCloser
fileReaders map[string]*zip.File
originalToNormalizedNames map[string]string
}
func NewZip() *Zip {
return &Zip{fileReaders: make(map[string]*zip.File, 0)}
return &Zip{fileReaders: make(map[string]*zip.File), originalToNormalizedNames: make(map[string]string)}
}
func (z *Zip) Initialize(importPath string) error {
@ -32,7 +37,11 @@ func (z *Zip) Initialize(importPath string) error {
if strings.HasPrefix(f.Name, "__MACOSX/") {
continue
}
fileReaders[normalizeName(f, i)] = f
normalizedName := normalizeName(f, i)
fileReaders[normalizedName] = f
if normalizedName != f.Name {
z.originalToNormalizedNames[f.Name] = normalizedName
}
}
z.fileReaders = fileReaders
return nil
@ -94,3 +103,10 @@ func (z *Zip) Close() {
// IsRootFile reports whether fileName has no directory component, i.e. its
// parent directory as computed by filepath.Dir is ".".
func (z *Zip) IsRootFile(fileName string) bool {
	parentDir := filepath.Dir(fileName)
	return parentDir == "."
}
// GetFileOriginalName looks fileName up in originalToNormalizedNames and
// returns the mapped value when an entry exists; otherwise it returns fileName
// unchanged.
//
// The map is filled during Initialize with archive entry name -> normalized
// name for entries whose normalized name differs, so the value returned for a
// known entry is the normalized name (the method name suggests the opposite).
func (z *Zip) GetFileOriginalName(fileName string) string {
	normalizedName, found := z.originalToNormalizedNames[fileName]
	if !found {
		return fileName
	}
	return normalizedName
}

View file

@ -2,18 +2,27 @@ package test
import (
"archive/zip"
"fmt"
"io"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
)
func CreateEmptyZip(t *testing.T, zipFileName string) error {
func CreateEmptyZip(t *testing.T, zipFileName string) {
zipFile, err := os.Create(zipFileName)
if err != nil {
return fmt.Errorf("Failed to create zip file: %w\n", err)
}
assert.NoError(t, err)
defer zipFile.Close()
zipWriter := zip.NewWriter(zipFile)
err = zipWriter.Close()
assert.NoError(t, err)
}
func CreateZipWithFiles(t *testing.T, zipFileName, testDataDir string, files []*zip.FileHeader) {
zipFile, err := os.Create(zipFileName)
assert.NoError(t, err)
defer zipFile.Close()
zipWriter := zip.NewWriter(zipFile)
@ -21,5 +30,13 @@ func CreateEmptyZip(t *testing.T, zipFileName string) error {
err = zipWriter.Close()
assert.NoError(t, err)
}()
return nil
for _, file := range files {
writer, err := zipWriter.CreateHeader(file)
assert.NoError(t, err)
fileReader, err := os.Open(filepath.Join(testDataDir, file.Name))
assert.NoError(t, err)
_, err = io.Copy(writer, fileReader)
assert.NoError(t, err)
}
}

View file

@ -819,8 +819,7 @@ func TestCSV_GetSnapshots(t *testing.T) {
// given
dir := t.TempDir()
zipPath := filepath.Join(dir, "empty.zip")
err := test.CreateEmptyZip(t, zipPath)
assert.Nil(t, err)
test.CreateEmptyZip(t, zipPath)
csv := CSV{}
p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})

View file

@ -61,8 +61,7 @@ func TestHTML_GetSnapshots(t *testing.T) {
// given
dir := t.TempDir()
zipPath := filepath.Join(dir, "empty.zip")
err := test.CreateEmptyZip(t, zipPath)
assert.Nil(t, err)
test.CreateEmptyZip(t, zipPath)
html := HTML{}
p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})

View file

@ -56,7 +56,7 @@ func (m *mdConverter) processFiles(importPath string, allErrors *common.ConvertE
}
fileInfo := m.getFileInfo(importSource, allErrors)
for name, file := range fileInfo {
m.processBlocks(name, file, fileInfo)
m.processBlocks(name, file, fileInfo, importSource)
for _, b := range file.ParsedBlocks {
m.processFileBlock(b, importSource, importPath)
}
@ -89,36 +89,38 @@ func (m *mdConverter) fillFilesInfo(importSource source.Source, fileInfo map[str
return nil
}
func (m *mdConverter) processBlocks(shortPath string, file *FileInfo, files map[string]*FileInfo) {
func (m *mdConverter) processBlocks(shortPath string, file *FileInfo, files map[string]*FileInfo, importSource source.Source) {
for _, block := range file.ParsedBlocks {
m.processTextBlock(block, files)
m.processTextBlock(block, files, importSource)
}
m.processLinkBlock(shortPath, file, files)
}
func (m *mdConverter) processTextBlock(block *model.Block, files map[string]*FileInfo) {
func (m *mdConverter) processTextBlock(block *model.Block, files map[string]*FileInfo, importSource source.Source) {
txt := block.GetText()
if txt != nil && txt.Marks != nil {
if len(txt.Marks.Marks) == 1 && txt.Marks.Marks[0].Type == model.BlockContentTextMark_Link {
m.handleSingleMark(block, files)
m.handleSingleMark(block, files, importSource)
} else {
m.handleMultipleMarks(block, files)
m.handleMultipleMarks(block, files, importSource)
}
}
}
func (m *mdConverter) handleSingleMark(block *model.Block, files map[string]*FileInfo) {
func (m *mdConverter) handleSingleMark(block *model.Block, files map[string]*FileInfo, importSource source.Source) {
txt := block.GetText()
link := txt.Marks.Marks[0].Param
wholeLineLink := m.isWholeLineLink(txt.Text, txt.Marks.Marks[0])
ext := filepath.Ext(link)
ext := filepath.Ext(txt.Marks.Marks[0].Param)
link := m.getOriginalName(txt.Marks.Marks[0].Param, importSource)
if file := files[link]; file != nil {
if strings.EqualFold(ext, ".csv") {
txt.Marks.Marks[0].Param = link
m.processCSVFileLink(block, files, link, wholeLineLink)
return
}
if strings.EqualFold(ext, ".md") {
// only convert if this is the only link in the row
txt.Marks.Marks[0].Param = link
m.convertToAnytypeLinkBlock(block, wholeLineLink)
} else {
block.Content = anymark.ConvertTextToFile(txt.Marks.Marks[0].Param)
@ -129,31 +131,33 @@ func (m *mdConverter) handleSingleMark(block *model.Block, files map[string]*Fil
}
}
func (m *mdConverter) handleMultipleMarks(block *model.Block, files map[string]*FileInfo) {
func (m *mdConverter) handleMultipleMarks(block *model.Block, files map[string]*FileInfo, importSource source.Source) {
txt := block.GetText()
for _, mark := range txt.Marks.Marks {
if mark.Type == model.BlockContentTextMark_Link {
if stop := m.handleSingleLinkMark(block, files, mark, txt); stop {
if stop := m.handleSingleLinkMark(block, files, mark, txt, importSource); stop {
return
}
}
}
}
func (m *mdConverter) handleSingleLinkMark(block *model.Block, files map[string]*FileInfo, mark *model.BlockContentTextMark, txt *model.BlockContentText) bool {
link := mark.Param
func (m *mdConverter) handleSingleLinkMark(block *model.Block, files map[string]*FileInfo, mark *model.BlockContentTextMark, txt *model.BlockContentText, importSource source.Source) bool {
isWholeLink := m.isWholeLineLink(txt.Text, mark)
link := m.getOriginalName(mark.Param, importSource)
ext := filepath.Ext(link)
if file := files[link]; file != nil {
file.HasInboundLinks = true
if strings.EqualFold(ext, ".md") || strings.EqualFold(ext, ".csv") {
mark.Type = model.BlockContentTextMark_Mention
mark.Param = link
return false
}
if m.isWholeLineLink(txt.Text, mark) {
if isWholeLink {
block.Content = anymark.ConvertTextToFile(mark.Param)
return true
}
} else if m.isWholeLineLink(txt.Text, mark) {
} else if isWholeLink {
m.convertTextToBookmark(mark.Param, block)
return true
}
@ -275,3 +279,10 @@ func (m *mdConverter) createBlocksFromFile(importSource source.Source, filePath
}
return nil
}
// getOriginalName translates link through importSource when the source
// implements source.OriginalFileNameGetter; for any other source the link is
// returned unchanged.
func (m *mdConverter) getOriginalName(link string, importSource source.Source) string {
	nameGetter, supported := importSource.(source.OriginalFileNameGetter)
	if !supported {
		return link
	}
	return nameGetter.GetFileOriginalName(link)
}

View file

@ -42,7 +42,7 @@ func Test_processFiles(t *testing.T) {
files := converter.processFiles(absolutePath, common.NewError(pb.RpcObjectImportRequest_IGNORE_ERRORS), source)
// then
assert.Len(t, files, 6)
assert.Len(t, files, 9)
pdfFilePath := filepath.Join(absolutePath, "test.pdf")
assert.Contains(t, files, pdfFilePath)
@ -74,7 +74,7 @@ func Test_processFiles(t *testing.T) {
files := converter.processFiles(absolutePath, common.NewError(pb.RpcObjectImportRequest_IGNORE_ERRORS), source)
// then
assert.Len(t, files, 4)
assert.Len(t, files, 7)
pdfFilePath := filepath.Join(absolutePath, "test.pdf")
assert.NotContains(t, files, pdfFilePath)

View file

@ -1,6 +1,7 @@
package markdown
import (
"archive/zip"
"context"
"errors"
"os"
@ -97,7 +98,7 @@ func TestMarkdown_GetSnapshots(t *testing.T) {
// then
assert.Nil(t, err)
assert.NotNil(t, sn)
assert.Len(t, sn.Snapshots, 4)
assert.Len(t, sn.Snapshots, 7)
fileNameToObjectId := make(map[string]string, len(sn.Snapshots))
for _, snapshot := range sn.Snapshots {
@ -119,8 +120,7 @@ func TestMarkdown_GetSnapshots(t *testing.T) {
// given
testDirectory := t.TempDir()
zipPath := filepath.Join(testDirectory, "empty.zip")
err := test.CreateEmptyZip(t, zipPath)
assert.Nil(t, err)
test.CreateEmptyZip(t, zipPath)
h := &Markdown{}
p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})
@ -139,6 +139,132 @@ func TestMarkdown_GetSnapshots(t *testing.T) {
assert.Nil(t, sn)
assert.True(t, errors.Is(ce.GetResultError(model.Import_Markdown), common.ErrFileImportNoObjectsInZipArchive))
})
t.Run("import non utf files", func(t *testing.T) {
// given
testDirectory := t.TempDir()
zipPath := filepath.Join(testDirectory, "nonutf.zip")
fileMdName := "こんにちは.md"
fileCsvName := "你好.csv"
fileWithLinksName := "nonutflinks.md"
test.CreateZipWithFiles(t, zipPath, "testdata", []*zip.FileHeader{
{
Name: fileWithLinksName,
Method: zip.Deflate,
},
{
Name: fileMdName,
Method: zip.Deflate,
NonUTF8: true,
},
{
Name: fileCsvName,
Method: zip.Deflate,
NonUTF8: true,
},
})
h := &Markdown{}
p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})
// when
sn, ce := h.GetSnapshots(context.Background(), &pb.RpcObjectImportRequest{
Params: &pb.RpcObjectImportRequestParamsOfMarkdownParams{
MarkdownParams: &pb.RpcObjectImportRequestMarkdownParams{Path: []string{zipPath}},
},
Type: model.Import_Markdown,
Mode: pb.RpcObjectImportRequest_IGNORE_ERRORS,
}, p)
// then
assert.Nil(t, ce)
assert.NotNil(t, sn)
assert.Len(t, sn.Snapshots, 4)
fileNameToObjectId := make(map[string]string, len(sn.Snapshots))
for _, snapshot := range sn.Snapshots {
fileNameToObjectId[snapshot.FileName] = snapshot.Id
}
var found bool
rootId := fileNameToObjectId[fileWithLinksName]
want := buildTreeWithNonUtfLinks(fileNameToObjectId, rootId)
for _, snapshot := range sn.Snapshots {
if snapshot.FileName == fileWithLinksName {
found = true
blockbuilder.AssertTreesEqual(t, want.Build(), snapshot.Snapshot.Data.Blocks)
}
}
assert.True(t, found)
})
}
// buildTreeWithNonUtfLinks builds the block tree the "import non utf files"
// test expects for the markdown file containing the links: six text blocks
// whose link marks have become mentions, followed by a link block to rootId.
//
// Mention targets are the ids stored in fileNameToObjectId under
// "import file 2.md" and "import file 3.csv".
// NOTE(review): presumably these are the normalized names given to the two
// non-UTF8 archive entries — confirm against normalizeName.
func buildTreeWithNonUtfLinks(fileNameToObjectId map[string]string, rootId string) *blockbuilder.Block {
	mdTargetID := fileNameToObjectId["import file 2.md"]
	csvTargetID := fileNameToObjectId["import file 3.csv"]

	// Each call yields a fresh Range so no two marks share a pointer.
	inlineRange := func() *model.Range { return &model.Range{From: 9, To: 14} }
	wholeRange := func() *model.Range { return &model.Range{From: 0, To: 5} }

	mention := func(r *model.Range, target string) *model.BlockContentTextMark {
		return &model.BlockContentTextMark{
			Range: r,
			Type:  model.BlockContentTextMark_Mention,
			Param: target,
		}
	}
	bold := func(r *model.Range) *model.BlockContentTextMark {
		return &model.BlockContentTextMark{
			Range: r,
			Type:  model.BlockContentTextMark_Bold,
		}
	}
	marks := func(list ...*model.BlockContentTextMark) model.BlockContentTextMarks {
		return model.BlockContentTextMarks{Marks: list}
	}

	return blockbuilder.Root(
		blockbuilder.ID(rootId),
		blockbuilder.Children(
			// Plain inline links.
			blockbuilder.Text("NonUtf 1 test6", blockbuilder.TextMarks(marks(
				mention(inlineRange(), mdTargetID),
			))),
			blockbuilder.Text("NonUtf 2 test7", blockbuilder.TextMarks(marks(
				mention(inlineRange(), csvTargetID),
			))),
			// Bold inline links.
			blockbuilder.Text("NonUtf 1 test6", blockbuilder.TextMarks(marks(
				mention(inlineRange(), mdTargetID),
				bold(inlineRange()),
			))),
			blockbuilder.Text("NonUtf 2 test7", blockbuilder.TextMarks(marks(
				mention(inlineRange(), csvTargetID),
				bold(inlineRange()),
			))),
			// Bold links that span the whole line.
			blockbuilder.Text("test6", blockbuilder.TextMarks(marks(
				mention(wholeRange(), mdTargetID),
				bold(wholeRange()),
			))),
			blockbuilder.Text("test7", blockbuilder.TextMarks(marks(
				mention(wholeRange(), csvTargetID),
				bold(wholeRange()),
			))),
			blockbuilder.Link(rootId),
		))
}
func buildExpectedTree(fileNameToObjectId map[string]string, provider *MockTempDir, rootId string) *blockbuilder.Block {

View file

@ -0,0 +1,11 @@
NonUtf 1 [test6](こんにちは.md)
NonUtf 2 [test7](你好.csv)
NonUtf 1 **[test6](こんにちは.md)**
NonUtf 2 **[test7](你好.csv)**
**[test6](こんにちは.md)**
**[test7](你好.csv)**

View file

View file

View file

@ -243,8 +243,7 @@ func TestPb_GetSnapshots(t *testing.T) {
dir := t.TempDir()
p := &Pb{}
zipPath := filepath.Join(dir, "empty.zip")
err := test.CreateEmptyZip(t, zipPath)
assert.Nil(t, err)
test.CreateEmptyZip(t, zipPath)
// when
_, ce := p.GetSnapshots(context.Background(), &pb.RpcObjectImportRequest{

View file

@ -78,8 +78,7 @@ func TestTXT_GetSnapshots(t *testing.T) {
// given
dir := t.TempDir()
zipPath := filepath.Join(dir, "empty.zip")
err := test.CreateEmptyZip(t, zipPath)
assert.Nil(t, err)
test.CreateEmptyZip(t, zipPath)
h := &TXT{}
p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})