1
0
Fork 0
mirror of https://github.com/anyproto/anytype-heart.git synced 2025-06-11 02:13:41 +09:00

GO-4753 rework file names detection when downloading files

This commit is contained in:
Roman Khafizianov 2024-12-18 13:41:43 +01:00
parent 9afac115e4
commit 1f79f955aa
No known key found for this signature in database
GPG key ID: F07A7D55A2684852
5 changed files with 243 additions and 56 deletions

View file

@ -2,8 +2,10 @@ package uri
import (
"fmt"
"mime"
"net/url"
"os"
"path/filepath"
"regexp"
"strings"
)
@ -84,3 +86,68 @@ func NormalizeAndParseURI(uri string) (*url.URL, error) {
return url.Parse(normalizeURI(uri))
}
// preferredExtensions pins the extension used for media types that are
// associated with several extensions. Without an entry here the first element
// returned by mime.ExtensionsByType is used, which is not necessarily the most
// common one. Keys must be lowercase media types without parameters.
var preferredExtensions = map[string]string{
	"image/jpeg": ".jpeg",
	"audio/mpeg": ".mp3",
	// Add more preferred mappings if needed
}

// GetFileNameFromURLAndContentType derives a file name for a resource located
// at u and served with the given Content-Type header value.
//
// If the last segment of u's path looks like a file name (it contains a dot and
// is not "." or ".."), that segment is returned as is and contentType is
// ignored. Otherwise the name is synthesized as "<host>_<base><ext>", where
// host is u's hostname without a leading "www." and with dots replaced by
// underscores, base is "image", "audio", "video" or "file" depending on the
// media type, and ext is the extension associated with the media type (".bin"
// when none is known). When u is nil the "<host>_" prefix is omitted.
//
// contentType may carry parameters (e.g. "text/html; charset=utf-8") and may
// use any letter case; only the normalized media type is considered.
func GetFileNameFromURLAndContentType(u *url.URL, contentType string) string {
	var host string
	if u != nil {
		if name := fileNameFromURLPath(u.Path); name != "" {
			// A plausible file name was found directly in the URL.
			return name
		}
		// No file name in the path, fall back to a host-based prefix.
		host = strings.TrimPrefix(u.Hostname(), "www.")
		host = strings.ReplaceAll(host, ".", "_")
		if host == "" {
			host = "file"
		}
	}

	mediaType := normalizeMediaType(contentType)
	name := baseNameForMediaType(mediaType) + extensionForMediaType(mediaType)
	if host == "" {
		return name
	}
	return host + "_" + name
}

// fileNameFromURLPath returns the last segment of urlPath when it looks like a
// file name, or "" otherwise. A segment qualifies when it contains a dot (this
// also covers hidden files such as ".htaccess"); the directory references "."
// and ".." never qualify.
func fileNameFromURLPath(urlPath string) string {
	segment := filepath.Base(urlPath)
	switch segment {
	case "", ".", "..":
		return ""
	}
	if !strings.Contains(segment, ".") {
		return ""
	}
	return segment
}

// normalizeMediaType reduces a Content-Type header value to its lowercase
// media type without parameters. Values that mime.ParseMediaType rejects are
// normalized by hand so that a malformed parameter does not discard an
// otherwise usable media type.
func normalizeMediaType(contentType string) string {
	if mediaType, _, err := mime.ParseMediaType(contentType); err == nil {
		return mediaType
	}
	if i := strings.IndexByte(contentType, ';'); i >= 0 {
		contentType = contentType[:i]
	}
	return strings.ToLower(strings.TrimSpace(contentType))
}

// extensionForMediaType returns the file extension, including the leading dot,
// to use for mediaType. Entries of preferredExtensions take precedence over
// the mime package; ".bin" is returned when no extension is known.
func extensionForMediaType(mediaType string) string {
	if ext, ok := preferredExtensions[mediaType]; ok {
		return ext
	}
	extensions, err := mime.ExtensionsByType(mediaType)
	if err != nil || len(extensions) == 0 {
		// Fallback if no known extension.
		return ".bin"
	}
	return extensions[0]
}

// baseNameForMediaType returns the generic name stem that corresponds to the
// top-level type of mediaType.
func baseNameForMediaType(mediaType string) string {
	switch {
	case strings.HasPrefix(mediaType, "image/"):
		return "image"
	case strings.HasPrefix(mediaType, "audio/"):
		return "audio"
	case strings.HasPrefix(mediaType, "video/"):
		return "video"
	default:
		return "file"
	}
}

View file

@ -1,6 +1,7 @@
package uri
import (
"net/url"
"testing"
"github.com/stretchr/testify/assert"
@ -118,3 +119,114 @@ func TestURI_ValidateURI(t *testing.T) {
assert.NoError(t, err)
})
}
// TestGetFileNameFromURLWithContentTypeAndMime checks the file names produced by
// GetFileNameFromURLAndContentType for URLs with and without a file name in
// their path, combined with known, unknown and empty content types.
func TestGetFileNameFromURLWithContentTypeAndMime(t *testing.T) {
	// mustParseURL parses s and aborts the test when s is not a valid URL.
	mustParseURL := func(s string) *url.URL {
		u, err := url.Parse(s)
		if err != nil {
			t.Fatalf("url.Parse(%q) failed: %v", s, err)
		}
		return u
	}

	tests := []struct {
		name        string
		url         *url.URL
		contentType string
		expected    string
	}{
		{
			name:        "URL with explicit filename and extension",
			url:         mustParseURL("https://example.com/image.jpg"),
			contentType: "image/jpeg",
			expected:    "image.jpg",
		},
		{
			name:        "URL with explicit filename and extension, but wrong content type",
			url:         mustParseURL("https://example.com/image.jpg"),
			contentType: "image/png",
			expected:    "image.jpg",
		},
		{
			name:        "URL with explicit filename and extension, and empty content type",
			url:         mustParseURL("https://example.com/image.jpg"),
			contentType: "",
			expected:    "image.jpg",
		},
		{
			name:        "URL with query and fragment, explicit filename",
			url:         mustParseURL("https://example.com/file.jpeg?query=1#111"),
			contentType: "image/jpeg",
			expected:    "file.jpeg",
		},
		{
			name:        "No filename in URL, fallback to host and image/jpeg",
			url:         mustParseURL("https://www.example.com/path/to/"),
			contentType: "image/jpeg",
			// host -> example_com
			// image/jpeg has several known extensions; the expected result is ".jpeg"
			expected: "example_com_image.jpeg",
		},
		{
			name:        "Host-only URL, fallback with image/png",
			url:         mustParseURL("https://www.example.com"),
			contentType: "image/png",
			expected:    "example_com_image.png",
		},
		{
			name:        "Filename present with video/mp4",
			url:         mustParseURL("https://www.sub.example.co.uk/folder/video.mp4"),
			contentType: "video/mp4",
			expected:    "video.mp4",
		},
		{
			name:        "No extension but filename present",
			url:         mustParseURL("https://example.com/filename"),
			contentType: "image/gif",
			expected:    "example_com_image.gif",
		},
		{
			// A nil URL contributes no host prefix; the name comes from the content type alone.
			name:        "Nil URL falls back to content type only",
			url:         nil,
			contentType: "image/jpeg",
			expected:    "image.jpeg",
		},
		{
			name:        "No filename, video/unknown fallback to .bin",
			url:         mustParseURL("https://www.subdomain.example.com/folder/"),
			contentType: "video/unknown",
			// no known extension for "video/unknown", fallback .bin
			expected: "subdomain_example_com_video.bin",
		},
		{
			name:        "Hidden file as filename",
			url:         mustParseURL("https://example.com/.htaccess"),
			contentType: "text/plain",
			expected:    ".htaccess",
		},
		{
			name:        "URL with query but no filename extension, fallback audio/mpeg",
			url:         mustParseURL("https://example.com/path?version=2"),
			contentType: "audio/mpeg",
			// audio/mpeg known extension: .mp3
			expected: "example_com_audio.mp3",
		},
		{
			name:        "Unknown type entirely",
			url:         mustParseURL("https://example.net/"),
			contentType: "application/x-something-strange",
			// no filename, fallback host: example_net
			// unknown type -> .bin
			expected: "example_net_file.bin",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := GetFileNameFromURLAndContentType(tt.url, tt.contentType)
			if got != tt.expected {
				t.Errorf("GetFileNameFromURLAndContentType(%q, %q) = %q; want %q", tt.url, tt.contentType, got, tt.expected)
			}
		})
	}
}