1
0
Fork 0
mirror of https://github.com/anyproto/anytype-heart.git synced 2025-06-09 17:44:59 +09:00
anytype-heart/util/linkpreview/linkpreview.go
2020-08-09 23:39:11 +03:00

157 lines
3.6 KiB
Go

package linkpreview
import (
"context"
"io"
"net/http"
"net/url"
"path/filepath"
"strings"
"unicode/utf8"
"github.com/anytypeio/go-anytype-middleware/util/uri"
"github.com/anytypeio/go-anytype-library/pb/model"
"github.com/mauidude/go-readability"
"github.com/microcosm-cc/bluemonday"
"github.com/otiai10/opengraph"
)
// New returns a LinkPreview backed by linkPreview. Its sanitizing policy is
// created with bluemonday.NewPolicy and AddSpaceWhenStrippingTag(true).
func New() LinkPreview {
	policy := bluemonday.NewPolicy().AddSpaceWhenStrippingTag(true)
	return &linkPreview{bmPolicy: policy}
}
// Limits applied while building a preview.
const (
// maxBytesToRead bounds how much of a response body is consumed (about
// 400 kB): limitReader.Read stops pulling from the underlying body once
// roughly this many bytes have been read.
maxBytesToRead = 400000
// maxDescriptionSize is the number of runes findContent keeps from the
// extracted page text before appending "...".
maxDescriptionSize = 200
)
// LinkPreview builds preview metadata for a URL.
type LinkPreview interface {
// Fetch returns the preview for url, or an error when none could be built.
Fetch(ctx context.Context, url string) (model.LinkPreview, error)
}
// linkPreview is the LinkPreview implementation returned by New.
type linkPreview struct {
// bmPolicy sanitizes the page content extracted by findContent.
bmPolicy *bluemonday.Policy
}
// Fetch loads fetchUrl and builds a preview for it.
//
// The page is requested through opengraph.FetchWithContext with an HTTP client
// whose transport records the last response and the bytes read from it. When
// that call fails but the recorded response has status 200, the preview is
// derived from the response via makeNonHtml; otherwise the error is returned
// together with an empty preview. On success a missing description is filled
// from the recorded body, and a title or description that is not valid UTF-8
// is blanked.
func (l *linkPreview) Fetch(ctx context.Context, fetchUrl string) (model.LinkPreview, error) {
	recorder := &proxyRoundTripper{RoundTripper: http.DefaultTransport}
	og, err := opengraph.FetchWithContext(ctx, fetchUrl, &http.Client{Transport: recorder})
	if err != nil {
		last := recorder.lastResponse
		if last == nil || last.StatusCode != http.StatusOK {
			return model.LinkPreview{}, err
		}
		// The request itself succeeded, so describe the resource from its headers.
		return l.makeNonHtml(fetchUrl, last)
	}

	preview := l.convertOGToInfo(og)
	if preview.Description == "" {
		preview.Description = l.findContent(recorder.lastBody)
	}
	if !utf8.ValidString(preview.Title) {
		preview.Title = ""
	}
	if !utf8.ValidString(preview.Description) {
		preview.Description = ""
	}
	return preview, nil
}
// convertOGToInfo maps parsed OpenGraph data onto a page-type preview.
//
// og.ToAbsURL is called first. When og carries no favicon, the favicon URL is
// built by rewriting og.URL in place (path set to "favicon.ico", query
// cleared), so og.URL is modified by this call. The first image, if any, is
// used as ImageUrl only when uri.ProcessURI accepts it.
func (l *linkPreview) convertOGToInfo(og *opengraph.OpenGraph) (i model.LinkPreview) {
	og.ToAbsURL()

	i.Url = og.URL.String()
	i.Title = og.Title
	i.Description = og.Description
	i.Type = model.LinkPreview_Page
	i.FaviconUrl = og.Favicon

	if i.FaviconUrl == "" {
		og.URL.Path = "favicon.ico"
		og.URL.RawQuery = ""
		i.FaviconUrl = og.URL.String()
	}

	if len(og.Image) == 0 {
		return i
	}
	if imageURL, err := uri.ProcessURI(og.Image[0].URL); err == nil {
		i.ImageUrl = imageURL
	}
	return i
}
// findContent extracts a short plain-text description from raw HTML.
//
// The document is parsed with readability, its content is sanitized with
// l.bmPolicy, runs of whitespace are collapsed to single spaces, and the
// result is cut to maxDescriptionSize runes followed by "...". An empty string
// is returned when the document cannot be parsed or when any step panics.
func (l *linkPreview) findContent(data []byte) (content string) {
	defer func() {
		if e := recover(); e != nil {
			// A panic while parsing or sanitizing is deliberately swallowed,
			// but the named result may already hold raw, unsanitized HTML at
			// that point, so clear it instead of leaking it to the caller.
			content = ""
		}
	}()
	doc, err := readability.NewDocument(string(data))
	if err != nil {
		return ""
	}
	content = doc.Content()
	content = strings.TrimSpace(l.bmPolicy.Sanitize(content))
	content = strings.Join(strings.Fields(content), " ") // collapse repeated whitespace
	if utf8.RuneCountInString(content) > maxDescriptionSize {
		// Cut on rune boundaries so multi-byte characters are never split.
		content = string([]rune(content)[:maxDescriptionSize]) + "..."
	}
	return content
}
// makeNonHtml builds a preview for a resource from its URL and response
// headers.
//
// The title is the last element of fetchUrl, and the type follows the
// Content-Type prefix: "image/" gives an image preview whose ImageUrl is
// fetchUrl, "text/" gives a text preview, anything else is unknown. The
// favicon URL is fetchUrl with its path replaced by "favicon.ico" and its
// query removed; it is left empty when fetchUrl does not parse. The returned
// error is always nil.
func (l *linkPreview) makeNonHtml(fetchUrl string, resp *http.Response) (i model.LinkPreview, err error) {
	i.Url = fetchUrl
	i.Title = filepath.Base(fetchUrl)

	switch contentType := resp.Header.Get("Content-Type"); {
	case strings.HasPrefix(contentType, "image/"):
		i.Type = model.LinkPreview_Image
		i.ImageUrl = fetchUrl
	case strings.HasPrefix(contentType, "text/"):
		i.Type = model.LinkPreview_Text
	default:
		i.Type = model.LinkPreview_Unknown
	}

	if parsed, parseErr := url.Parse(fetchUrl); parseErr == nil {
		parsed.Path = "favicon.ico"
		parsed.RawQuery = ""
		i.FaviconUrl = parsed.String()
	}
	return i, err
}
// proxyRoundTripper wraps an http.RoundTripper and remembers what passed
// through it.
type proxyRoundTripper struct {
http.RoundTripper
// lastResponse is the response of the most recent RoundTrip that returned
// no error.
lastResponse *http.Response
// lastBody collects the bytes that limitReader has read from response
// bodies.
lastBody []byte
}
// RoundTrip performs req with the wrapped RoundTripper and, on success,
// records the response and wraps its body in a limitReader that feeds
// p.lastBody.
//
// lastBody is reset for every new response. http.Client drains part of an
// intermediate redirect response through the wrapped body before it issues the
// follow-up request; without the reset those bytes would be prepended to the
// body of the final page.
func (p *proxyRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	resp, err := p.RoundTripper.RoundTrip(req)
	if err != nil {
		return resp, err
	}
	p.lastResponse = resp
	p.lastBody = nil
	resp.Body = &limitReader{ReadCloser: resp.Body, rt: p}
	return resp, nil
}
// limitReader wraps a response body, copying what is read into rt.lastBody
// and reporting io.EOF once about maxBytesToRead bytes have been read.
type limitReader struct {
// rt receives the bytes read through this reader.
rt *proxyRoundTripper
// nTotal counts the bytes read so far.
nTotal int
io.ReadCloser
}
// Read reads from the wrapped body into p, mirrors the bytes into
// l.rt.lastBody and never lets the running total exceed maxBytesToRead.
//
// Once the budget is used up Read returns 0, io.EOF without touching the
// underlying body. A buffer larger than the remaining budget is clamped, so a
// single large read cannot overshoot the limit. Every byte handed back to the
// caller is recorded, including bytes returned together with an error, which
// keeps lastBody identical to what the caller actually received.
func (l *limitReader) Read(p []byte) (n int, err error) {
	remaining := maxBytesToRead - l.nTotal
	if remaining <= 0 {
		return 0, io.EOF
	}
	if len(p) > remaining {
		p = p[:remaining]
	}
	n, err = l.ReadCloser.Read(p)
	if n > 0 {
		l.rt.lastBody = append(l.rt.lastBody, p[:n]...)
		l.nTotal += n
	}
	return n, err
}