Merge pull request #1917 from anyproto/go-4573-importing-markdown-files-named-in-japanese-or-chinese

GO-4573: When importing markdown files named in Japanese or Chinese, references to these files won't resolve properly
2025-06-09 17:44:59 +09:00 · 2024-12-09 10:46:39 +01:00 · 2024-12-09 10:46:39 +01:00 · ec00669550
commit ec00669550
parent efa8c8ddf2 a314557810
12 changed files with 215 additions and 38 deletions
--- a/core/block/import/common/source/zip.go
+++ b/core/block/import/common/source/zip.go
@@ -12,13 +12,18 @@ import (
 	"github.com/anyproto/anytype-heart/util/anyerror"
 )

+type OriginalFileNameGetter interface {
+	GetFileOriginalName(filename string) string
+}
+
 type Zip struct {
-	archiveReader *zip.ReadCloser
-	fileReaders   map[string]*zip.File
+	archiveReader             *zip.ReadCloser
+	fileReaders               map[string]*zip.File
+	originalToNormalizedNames map[string]string
 }

 func NewZip() *Zip {
-	return &Zip{fileReaders: make(map[string]*zip.File, 0)}
+	return &Zip{fileReaders: make(map[string]*zip.File), originalToNormalizedNames: make(map[string]string)}
 }

 func (z *Zip) Initialize(importPath string) error {
@@ -32,7 +37,11 @@ func (z *Zip) Initialize(importPath string) error {
 		if strings.HasPrefix(f.Name, "__MACOSX/") {
 			continue
 		}
-		fileReaders[normalizeName(f, i)] = f
+		normalizedName := normalizeName(f, i)
+		fileReaders[normalizedName] = f
+		if normalizedName != f.Name {
+			z.originalToNormalizedNames[f.Name] = normalizedName
+		}
 	}
 	z.fileReaders = fileReaders
 	return nil
@@ -94,3 +103,10 @@ func (z *Zip) Close() {
 func (z *Zip) IsRootFile(fileName string) bool {
 	return filepath.Dir(fileName) == "."
 }
+
+func (z *Zip) GetFileOriginalName(fileName string) string {
+	if originalName, ok := z.originalToNormalizedNames[fileName]; ok {
+		return originalName
+	}
+	return fileName
+}
--- a/core/block/import/common/test/utils.go
+++ b/core/block/import/common/test/utils.go
@@ -2,18 +2,27 @@ package test

 import (
 	"archive/zip"
-	"fmt"
+	"io"
 	"os"
+	"path/filepath"
 	"testing"

 	"github.com/stretchr/testify/assert"
 )

-func CreateEmptyZip(t *testing.T, zipFileName string) error {
+func CreateEmptyZip(t *testing.T, zipFileName string) {
 	zipFile, err := os.Create(zipFileName)
-	if err != nil {
-		return fmt.Errorf("Failed to create zip file: %w\n", err)
-	}
+	assert.NoError(t, err)
+	defer zipFile.Close()
+
+	zipWriter := zip.NewWriter(zipFile)
+	err = zipWriter.Close()
+	assert.NoError(t, err)
+}
+
+func CreateZipWithFiles(t *testing.T, zipFileName, testDataDir string, files []*zip.FileHeader) {
+	zipFile, err := os.Create(zipFileName)
+	assert.NoError(t, err)
 	defer zipFile.Close()

 	zipWriter := zip.NewWriter(zipFile)
@@ -21,5 +30,13 @@ func CreateEmptyZip(t *testing.T, zipFileName string) error {
 		err = zipWriter.Close()
 		assert.NoError(t, err)
 	}()
-	return nil
+
+	for _, file := range files {
+		writer, err := zipWriter.CreateHeader(file)
+		assert.NoError(t, err)
+		fileReader, err := os.Open(filepath.Join(testDataDir, file.Name))
+		assert.NoError(t, err)
+		_, err = io.Copy(writer, fileReader)
+		assert.NoError(t, err)
+	}
 }
--- a/core/block/import/csv/converter_test.go
+++ b/core/block/import/csv/converter_test.go
@@ -819,8 +819,7 @@ func TestCSV_GetSnapshots(t *testing.T) {
 		// given
 		dir := t.TempDir()
 		zipPath := filepath.Join(dir, "empty.zip")
-		err := test.CreateEmptyZip(t, zipPath)
-		assert.Nil(t, err)
+		test.CreateEmptyZip(t, zipPath)
 		csv := CSV{}
 		p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})

--- a/core/block/import/html/converter_test.go
+++ b/core/block/import/html/converter_test.go
@@ -61,8 +61,7 @@ func TestHTML_GetSnapshots(t *testing.T) {
 		// given
 		dir := t.TempDir()
 		zipPath := filepath.Join(dir, "empty.zip")
-		err := test.CreateEmptyZip(t, zipPath)
-		assert.Nil(t, err)
+		test.CreateEmptyZip(t, zipPath)
 		html := HTML{}
 		p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})

--- a/core/block/import/markdown/blockconverter.go
+++ b/core/block/import/markdown/blockconverter.go
@@ -56,7 +56,7 @@ func (m *mdConverter) processFiles(importPath string, allErrors *common.ConvertE
 	}
 	fileInfo := m.getFileInfo(importSource, allErrors)
 	for name, file := range fileInfo {
-		m.processBlocks(name, file, fileInfo)
+		m.processBlocks(name, file, fileInfo, importSource)
 		for _, b := range file.ParsedBlocks {
 			m.processFileBlock(b, importSource, importPath)
 		}
@@ -89,36 +89,38 @@ func (m *mdConverter) fillFilesInfo(importSource source.Source, fileInfo map[str
 	return nil
 }

-func (m *mdConverter) processBlocks(shortPath string, file *FileInfo, files map[string]*FileInfo) {
+func (m *mdConverter) processBlocks(shortPath string, file *FileInfo, files map[string]*FileInfo, importSource source.Source) {
 	for _, block := range file.ParsedBlocks {
-		m.processTextBlock(block, files)
+		m.processTextBlock(block, files, importSource)
 	}
 	m.processLinkBlock(shortPath, file, files)
 }

-func (m *mdConverter) processTextBlock(block *model.Block, files map[string]*FileInfo) {
+func (m *mdConverter) processTextBlock(block *model.Block, files map[string]*FileInfo, importSource source.Source) {
 	txt := block.GetText()
 	if txt != nil && txt.Marks != nil {
 		if len(txt.Marks.Marks) == 1 && txt.Marks.Marks[0].Type == model.BlockContentTextMark_Link {
-			m.handleSingleMark(block, files)
+			m.handleSingleMark(block, files, importSource)
 		} else {
-			m.handleMultipleMarks(block, files)
+			m.handleMultipleMarks(block, files, importSource)
 		}
 	}
 }

-func (m *mdConverter) handleSingleMark(block *model.Block, files map[string]*FileInfo) {
+func (m *mdConverter) handleSingleMark(block *model.Block, files map[string]*FileInfo, importSource source.Source) {
 	txt := block.GetText()
-	link := txt.Marks.Marks[0].Param
 	wholeLineLink := m.isWholeLineLink(txt.Text, txt.Marks.Marks[0])
-	ext := filepath.Ext(link)
+	ext := filepath.Ext(txt.Marks.Marks[0].Param)
+	link := m.getOriginalName(txt.Marks.Marks[0].Param, importSource)
 	if file := files[link]; file != nil {
 		if strings.EqualFold(ext, ".csv") {
+			txt.Marks.Marks[0].Param = link
 			m.processCSVFileLink(block, files, link, wholeLineLink)
 			return
 		}
 		if strings.EqualFold(ext, ".md") {
 			// only convert if this is the only link in the row
+			txt.Marks.Marks[0].Param = link
 			m.convertToAnytypeLinkBlock(block, wholeLineLink)
 		} else {
 			block.Content = anymark.ConvertTextToFile(txt.Marks.Marks[0].Param)
@@ -129,31 +131,33 @@ func (m *mdConverter) handleSingleMark(block *model.Block, files map[string]*Fil
 	}
 }

-func (m *mdConverter) handleMultipleMarks(block *model.Block, files map[string]*FileInfo) {
+func (m *mdConverter) handleMultipleMarks(block *model.Block, files map[string]*FileInfo, importSource source.Source) {
 	txt := block.GetText()
 	for _, mark := range txt.Marks.Marks {
 		if mark.Type == model.BlockContentTextMark_Link {
-			if stop := m.handleSingleLinkMark(block, files, mark, txt); stop {
+			if stop := m.handleSingleLinkMark(block, files, mark, txt, importSource); stop {
 				return
 			}
 		}
 	}
 }

-func (m *mdConverter) handleSingleLinkMark(block *model.Block, files map[string]*FileInfo, mark *model.BlockContentTextMark, txt *model.BlockContentText) bool {
-	link := mark.Param
+func (m *mdConverter) handleSingleLinkMark(block *model.Block, files map[string]*FileInfo, mark *model.BlockContentTextMark, txt *model.BlockContentText, importSource source.Source) bool {
+	isWholeLink := m.isWholeLineLink(txt.Text, mark)
+	link := m.getOriginalName(mark.Param, importSource)
 	ext := filepath.Ext(link)
 	if file := files[link]; file != nil {
 		file.HasInboundLinks = true
 		if strings.EqualFold(ext, ".md") || strings.EqualFold(ext, ".csv") {
 			mark.Type = model.BlockContentTextMark_Mention
+			mark.Param = link
 			return false
 		}
-		if m.isWholeLineLink(txt.Text, mark) {
+		if isWholeLink {
 			block.Content = anymark.ConvertTextToFile(mark.Param)
 			return true
 		}
-	} else if m.isWholeLineLink(txt.Text, mark) {
+	} else if isWholeLink {
 		m.convertTextToBookmark(mark.Param, block)
 		return true
 	}
@@ -275,3 +279,10 @@ func (m *mdConverter) createBlocksFromFile(importSource source.Source, filePath
 	}
 	return nil
 }
+
+func (m *mdConverter) getOriginalName(link string, importSource source.Source) string {
+	if originalFileNameGetter, ok := importSource.(source.OriginalFileNameGetter); ok {
+		return originalFileNameGetter.GetFileOriginalName(link)
+	}
+	return link
+}
--- a/core/block/import/markdown/blockconverter_test.go
+++ b/core/block/import/markdown/blockconverter_test.go
@@ -42,7 +42,7 @@ func Test_processFiles(t *testing.T) {
 		files := converter.processFiles(absolutePath, common.NewError(pb.RpcObjectImportRequest_IGNORE_ERRORS), source)

 		// then
-		assert.Len(t, files, 6)
+		assert.Len(t, files, 9)

 		pdfFilePath := filepath.Join(absolutePath, "test.pdf")
 		assert.Contains(t, files, pdfFilePath)
@@ -74,7 +74,7 @@ func Test_processFiles(t *testing.T) {
 		files := converter.processFiles(absolutePath, common.NewError(pb.RpcObjectImportRequest_IGNORE_ERRORS), source)

 		// then
-		assert.Len(t, files, 4)
+		assert.Len(t, files, 7)

 		pdfFilePath := filepath.Join(absolutePath, "test.pdf")
 		assert.NotContains(t, files, pdfFilePath)
--- a/core/block/import/markdown/import_test.go
+++ b/core/block/import/markdown/import_test.go
@@ -1,6 +1,7 @@
 package markdown

 import (
+	"archive/zip"
 	"context"
 	"errors"
 	"os"
@@ -97,7 +98,7 @@ func TestMarkdown_GetSnapshots(t *testing.T) {
 		// then
 		assert.Nil(t, err)
 		assert.NotNil(t, sn)
-		assert.Len(t, sn.Snapshots, 4)
+		assert.Len(t, sn.Snapshots, 7)

 		fileNameToObjectId := make(map[string]string, len(sn.Snapshots))
 		for _, snapshot := range sn.Snapshots {
@@ -119,8 +120,7 @@ func TestMarkdown_GetSnapshots(t *testing.T) {
 		// given
 		testDirectory := t.TempDir()
 		zipPath := filepath.Join(testDirectory, "empty.zip")
-		err := test.CreateEmptyZip(t, zipPath)
-		assert.Nil(t, err)
+		test.CreateEmptyZip(t, zipPath)

 		h := &Markdown{}
 		p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})
@@ -139,6 +139,132 @@ func TestMarkdown_GetSnapshots(t *testing.T) {
 		assert.Nil(t, sn)
 		assert.True(t, errors.Is(ce.GetResultError(model.Import_Markdown), common.ErrFileImportNoObjectsInZipArchive))
 	})
+	t.Run("import non utf files", func(t *testing.T) {
+		// given
+		testDirectory := t.TempDir()
+		zipPath := filepath.Join(testDirectory, "nonutf.zip")
+		fileMdName := "こんにちは.md"
+		fileCsvName := "你好.csv"
+		fileWithLinksName := "nonutflinks.md"
+
+		test.CreateZipWithFiles(t, zipPath, "testdata", []*zip.FileHeader{
+			{
+				Name:   fileWithLinksName,
+				Method: zip.Deflate,
+			},
+			{
+				Name:    fileMdName,
+				Method:  zip.Deflate,
+				NonUTF8: true,
+			},
+			{
+				Name:    fileCsvName,
+				Method:  zip.Deflate,
+				NonUTF8: true,
+			},
+		})
+
+		h := &Markdown{}
+		p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})
+
+		// when
+		sn, ce := h.GetSnapshots(context.Background(), &pb.RpcObjectImportRequest{
+			Params: &pb.RpcObjectImportRequestParamsOfMarkdownParams{
+				MarkdownParams: &pb.RpcObjectImportRequestMarkdownParams{Path: []string{zipPath}},
+			},
+			Type: model.Import_Markdown,
+			Mode: pb.RpcObjectImportRequest_IGNORE_ERRORS,
+		}, p)
+
+		// then
+		assert.Nil(t, ce)
+		assert.NotNil(t, sn)
+		assert.Len(t, sn.Snapshots, 4)
+		fileNameToObjectId := make(map[string]string, len(sn.Snapshots))
+		for _, snapshot := range sn.Snapshots {
+			fileNameToObjectId[snapshot.FileName] = snapshot.Id
+		}
+		var found bool
+		rootId := fileNameToObjectId[fileWithLinksName]
+		want := buildTreeWithNonUtfLinks(fileNameToObjectId, rootId)
+		for _, snapshot := range sn.Snapshots {
+			if snapshot.FileName == fileWithLinksName {
+				found = true
+				blockbuilder.AssertTreesEqual(t, want.Build(), snapshot.Snapshot.Data.Blocks)
+			}
+		}
+		assert.True(t, found)
+	})
+}
+
+func buildTreeWithNonUtfLinks(fileNameToObjectId map[string]string, rootId string) *blockbuilder.Block {
+	testMdPath := fileNameToObjectId["import file 2.md"]
+	testCsvPath := fileNameToObjectId["import file 3.csv"]
+
+	want := blockbuilder.Root(
+		blockbuilder.ID(rootId),
+		blockbuilder.Children(
+			blockbuilder.Text("NonUtf 1 test6", blockbuilder.TextMarks(model.BlockContentTextMarks{Marks: []*model.BlockContentTextMark{
+				{
+					Range: &model.Range{From: 9, To: 14},
+					Type:  model.BlockContentTextMark_Mention,
+					Param: testMdPath,
+				},
+			}})),
+			blockbuilder.Text("NonUtf 2 test7", blockbuilder.TextMarks(model.BlockContentTextMarks{Marks: []*model.BlockContentTextMark{
+				{
+					Range: &model.Range{From: 9, To: 14},
+					Type:  model.BlockContentTextMark_Mention,
+					Param: testCsvPath,
+				},
+			}})),
+			blockbuilder.Text("NonUtf 1 test6", blockbuilder.TextMarks(model.BlockContentTextMarks{Marks: []*model.BlockContentTextMark{
+				{
+					Range: &model.Range{From: 9, To: 14},
+					Type:  model.BlockContentTextMark_Mention,
+					Param: testMdPath,
+				},
+				{
+					Range: &model.Range{From: 9, To: 14},
+					Type:  model.BlockContentTextMark_Bold,
+				},
+			}})),
+			blockbuilder.Text("NonUtf 2 test7", blockbuilder.TextMarks(model.BlockContentTextMarks{Marks: []*model.BlockContentTextMark{
+				{
+					Range: &model.Range{From: 9, To: 14},
+					Type:  model.BlockContentTextMark_Mention,
+					Param: testCsvPath,
+				},
+				{
+					Range: &model.Range{From: 9, To: 14},
+					Type:  model.BlockContentTextMark_Bold,
+				},
+			}})),
+			blockbuilder.Text("test6", blockbuilder.TextMarks(model.BlockContentTextMarks{Marks: []*model.BlockContentTextMark{
+				{
+					Range: &model.Range{From: 0, To: 5},
+					Type:  model.BlockContentTextMark_Mention,
+					Param: testMdPath,
+				},
+				{
+					Range: &model.Range{From: 0, To: 5},
+					Type:  model.BlockContentTextMark_Bold,
+				},
+			}})),
+			blockbuilder.Text("test7", blockbuilder.TextMarks(model.BlockContentTextMarks{Marks: []*model.BlockContentTextMark{
+				{
+					Range: &model.Range{From: 0, To: 5},
+					Type:  model.BlockContentTextMark_Mention,
+					Param: testCsvPath,
+				},
+				{
+					Range: &model.Range{From: 0, To: 5},
+					Type:  model.BlockContentTextMark_Bold,
+				},
+			}})),
+			blockbuilder.Link(rootId),
+		))
+	return want
 }

 func buildExpectedTree(fileNameToObjectId map[string]string, provider *MockTempDir, rootId string) *blockbuilder.Block {
--- a/core/block/import/markdown/testdata/nonutflinks.md
+++ b/core/block/import/markdown/testdata/nonutflinks.md
@@ -0,0 +1,11 @@
+NonUtf 1 [test6](こんにちは.md)
+
+NonUtf 2 [test7](你好.csv)
+
+NonUtf 1 **[test6](こんにちは.md)**
+
+NonUtf 2 **[test7](你好.csv)**
+
+**[test6](こんにちは.md)**
+
+**[test7](你好.csv)**
--- a/core/block/import/markdown/testdata/こんにちは.md
+++ b/core/block/import/markdown/testdata/こんにちは.md
--- a/core/block/import/markdown/testdata/你好.csv
+++ b/core/block/import/markdown/testdata/你好.csv
--- a/core/block/import/pb/converter_test.go
+++ b/core/block/import/pb/converter_test.go
@@ -243,8 +243,7 @@ func TestPb_GetSnapshots(t *testing.T) {
 		dir := t.TempDir()
 		p := &Pb{}
 		zipPath := filepath.Join(dir, "empty.zip")
-		err := test.CreateEmptyZip(t, zipPath)
-		assert.Nil(t, err)
+		test.CreateEmptyZip(t, zipPath)

 		// when
 		_, ce := p.GetSnapshots(context.Background(), &pb.RpcObjectImportRequest{
--- a/core/block/import/txt/converter_test.go
+++ b/core/block/import/txt/converter_test.go
@@ -78,8 +78,7 @@ func TestTXT_GetSnapshots(t *testing.T) {
 		// given
 		dir := t.TempDir()
 		zipPath := filepath.Join(dir, "empty.zip")
-		err := test.CreateEmptyZip(t, zipPath)
-		assert.Nil(t, err)
+		test.CreateEmptyZip(t, zipPath)

 		h := &TXT{}
 		p := process.NewProgress(&pb.ModelProcessMessageOfImport{Import: &pb.ModelProcessImport{}})