1
0
Fork 0
mirror of https://github.com/anyproto/anytype-heart.git synced 2025-06-11 18:20:33 +09:00

merge linkpreview

This commit is contained in:
Sergey Cherepanov 2020-02-13 14:41:35 +03:00
commit ba1a063e86
No known key found for this signature in database
GPG key ID: 085319C64294F576
5 changed files with 264 additions and 2 deletions

5
go.mod
View file

@ -4,7 +4,7 @@ go 1.13
require (
github.com/PuerkitoBio/goquery v1.5.0 // indirect
github.com/anytypeio/go-anytype-library v0.0.0-20200207123657-05526fc73774
github.com/anytypeio/go-anytype-library v0.0.0-20200213113659-8cc266f8bcea
github.com/anytypeio/html-to-markdown v0.0.0-20200123120722-1c256e006f13
github.com/gogo/protobuf v1.3.1
@ -12,7 +12,10 @@ require (
github.com/google/uuid v1.1.1
github.com/ipfs/go-log v0.0.1
github.com/lunny/html2md v0.0.0-20181018071239-7d234de44546
github.com/mauidude/go-readability v0.0.0-20141216012317-2f30b1a346f1
github.com/microcosm-cc/bluemonday v1.0.2
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826
github.com/otiai10/opengraph v1.1.0
github.com/stretchr/testify v1.4.0
github.com/textileio/go-textile v0.7.8-0.20200102164400-98b263e32c0c
github.com/yosssi/gohtml v0.0.0-20190915184251-7ff6f235ecaf

16
go.sum
View file

@ -29,6 +29,8 @@ github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRy
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/anytypeio/go-anytype-library v0.0.0-20200207123657-05526fc73774 h1:/QdI8TX2boY3SaAZCD5+BF5TsR2gdsmuVXRBXKFhz9E=
github.com/anytypeio/go-anytype-library v0.0.0-20200207123657-05526fc73774/go.mod h1:GlKZUsdpJePFbhZWJfMZhXpOW0C9CpPVMaPqo3L/09U=
github.com/anytypeio/go-anytype-library v0.0.0-20200213113659-8cc266f8bcea h1:HSTwvg9Jzn9ndEIvrSt2l5r3u7sgojC8Q4k+bKdaFpM=
github.com/anytypeio/go-anytype-library v0.0.0-20200213113659-8cc266f8bcea/go.mod h1:GlKZUsdpJePFbhZWJfMZhXpOW0C9CpPVMaPqo3L/09U=
github.com/anytypeio/go-textile v0.7.8-0.20200205164649-a38fa26a4621 h1:WKIoqcvNon5cU8ZxwzIESR34+NG6Lt8horyQ6cJhREQ=
github.com/anytypeio/go-textile v0.7.8-0.20200205164649-a38fa26a4621/go.mod h1:GpthA3sPUte8bRAATVaBj0rzWtxhv+8k6Js2jiMYIlw=
github.com/anytypeio/html-to-markdown v0.0.0-20200123120722-1c256e006f13 h1:XMUxybyqCaEAGZZK/Qv2cRB0klc04dNTgtSqpacg15Y=
@ -70,6 +72,7 @@ github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmf
github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cskr/pubsub v1.0.2 h1:vlOzMhl6PFn60gRlTQQsIfVwaPB/B/8MziK8FhEPt/0=
github.com/cskr/pubsub v1.0.2/go.mod h1:/8MzYXk/NJAz782G8RPkFzXTZVu63VotefPnR9TIRis=
github.com/davecgh/go-spew v0.0.0-20171005155431-ecdeabc65495/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -564,7 +567,11 @@ github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mauidude/go-readability v0.0.0-20141216012317-2f30b1a346f1 h1:LUX7+Xw9WqBYU1KIhBeHhE9IEziRmfE6QL/KOJw27XY=
github.com/mauidude/go-readability v0.0.0-20141216012317-2f30b1a346f1/go.mod h1:JunVAey+5bzS6oFzfAEXL15REaSvzaDhB+muny1oSNU=
github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE=
github.com/microcosm-cc/bluemonday v1.0.2 h1:5lPfLTTAvAbtS0VqT+94yOtFnGfUWYyx0+iToC3Os3s=
github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc=
github.com/miekg/dns v1.1.4/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
github.com/miekg/dns v1.1.12 h1:WMhc1ik4LNkTg8U9l3hI1LvxKmIL+f1+WV/SZtCbDDA=
github.com/miekg/dns v1.1.12/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
@ -642,6 +649,11 @@ github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1Cpa
github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsqf19k25Ur8rU=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
github.com/otiai10/marmoset v0.4.0/go.mod h1:t2q6dXWZ9YcFdRREDApX4bCmfQnL3isJ2dgl8ychlXg=
github.com/otiai10/mint v1.3.0/go.mod h1:F5AjcsTsWUqX+Na9fpHb52P8pcRX2CI6A3ctIT91xUo=
github.com/otiai10/opengraph v1.1.0 h1:7CLPM41/VNVOh6V04K9Ccui2Fn6p+NEZ06krlRxV7HQ=
github.com/otiai10/opengraph v1.1.0/go.mod h1:ZMbPcfiSRSsg3+yrWZCXrgYL6kEK4KpH4GG1iyIvEXs=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
@ -668,12 +680,14 @@ github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDa
github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik=
github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd h1:CmH9+J6ZSsIjUK3dcGsnCnO41eRBOnY12zwkn5qVwgc=
github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk=
github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/segmentio/ksuid v1.0.2 h1:9yBfKyw4ECGTdALaF09Snw3sLJmYIX6AbPJrAy6MrDc=
github.com/segmentio/ksuid v1.0.2/go.mod h1:BXuJDr2byAiHuQaQtSKoXh1J0YmUDurywOXgB2w+OSU=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.2.0 h1:juTguoYk5qI21pwyTXY3B3Y5cOTH3ZUyZCg1v/mihuo=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
@ -722,6 +736,7 @@ github.com/ugorji/go/codec v0.0.0-20181209151446-772ced7fd4c2/go.mod h1:VFNgLljT
github.com/ugorji/go/codec v1.1.5-pre h1:5YV9PsFAN+ndcCtTM7s60no7nY7eTG3LPtxhSwuxzCs=
github.com/ugorji/go/codec v1.1.5-pre/go.mod h1:tULtS6Gy1AE1yCENaw4Vb//HLH5njI2tfCQDUqRd8fI=
github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
github.com/warpfork/go-wish v0.0.0-20180510122957-5ad1f5abf436/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw=
github.com/warpfork/go-wish v0.0.0-20190328234359-8b3e70f8e830 h1:8kxMKmKzXXL4Ru1nyhvdms/JjWt+3YLpvRb/bAjO/y0=
github.com/warpfork/go-wish v0.0.0-20190328234359-8b3e70f8e830/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw=
@ -823,6 +838,7 @@ golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR
golang.org/x/net v0.0.0-20190611141213-3f473d35a33a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582 h1:p9xBe/w/OzkeYVKm234g55gMdD1nSIooTir5kV11kfA=
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=

View file

@ -14,7 +14,7 @@ import (
var log = logging.Logger("anytype-mw")
var mw = &core.Middleware{}
var mw = core.NewMiddleware()
func init() {
registerClientCommandsHandler(mw)

View file

@ -0,0 +1,132 @@
package linkpreview
import (
"context"
"io"
"net/http"
"path/filepath"
"strings"
"unicode/utf8"
"github.com/anytypeio/go-anytype-library/pb/model"
"github.com/mauidude/go-readability"
"github.com/microcosm-cc/bluemonday"
"github.com/otiai10/opengraph"
)
// New constructs the package's LinkPreview implementation. The sanitizing
// policy it carries is built with bluemonday.NewPolicy and has
// AddSpaceWhenStrippingTag enabled.
func New() LinkPreview {
	policy := bluemonday.NewPolicy().AddSpaceWhenStrippingTag(true)
	preview := &linkPreview{bmPolicy: policy}
	return preview
}
// Limits applied while building a link preview.
const (
// maxBytesToRead is the byte budget limitReader.Read compares its running
// total against before reading more of a response body.
// read no more than 400 kb
maxBytesToRead = 400000
// maxDescriptionSize is the number of runes findContent keeps from the
// extracted page text before appending "...".
maxDescriptionSize = 200
)
// LinkPreview produces preview metadata for a URL.
type LinkPreview interface {
// Fetch requests url under ctx and returns the preview built from the
// response, or an error when no preview could be produced.
Fetch(ctx context.Context, url string) (model.ModelLinkPreview, error)
}
// linkPreview is the LinkPreview implementation returned by New.
type linkPreview struct {
// bmPolicy is the bluemonday policy findContent passes extracted page
// content through before using it as a description.
bmPolicy *bluemonday.Policy
}
// Fetch loads url through an HTTP client whose transport records the last
// response and the body bytes that were read, then builds a preview.
//
// When the OpenGraph fetch succeeds, the preview is derived from the parsed
// metadata; if that metadata carries no description, one is extracted from the
// recorded body. When the OpenGraph fetch fails but the last recorded response
// had status 200, the preview is built from the response headers instead.
// Otherwise the OpenGraph error is returned with an empty preview.
func (l *linkPreview) Fetch(ctx context.Context, url string) (model.ModelLinkPreview, error) {
	transport := &proxyRoundTripper{RoundTripper: http.DefaultTransport}
	httpClient := &http.Client{Transport: transport}

	graph, fetchErr := opengraph.FetchWithContext(ctx, url, httpClient)
	if fetchErr != nil {
		last := transport.lastResponse
		if last == nil || last.StatusCode != http.StatusOK {
			return model.ModelLinkPreview{}, fetchErr
		}
		// The server answered OK but the payload was not usable as OpenGraph
		// input: describe it from the response headers.
		return l.makeNonHtml(url, last)
	}

	preview := l.convertOGToInfo(graph)
	if preview.Description == "" {
		preview.Description = l.findContent(transport.lastBody)
	}
	return preview, nil
}
// convertOGToInfo maps parsed OpenGraph metadata onto a page-type link preview.
// og.ToAbsURL is invoked first (presumably to make relative asset URLs
// absolute; confirm against the opengraph package docs). Only the first image,
// if any, is used as the preview image.
func (l *linkPreview) convertOGToInfo(og *opengraph.OpenGraph) (i model.ModelLinkPreview) {
	og.ToAbsURL()

	i.Url = og.URL.String()
	i.Title = og.Title
	i.Description = og.Description
	i.Type = model.ModelLinkPreview_Page
	i.FaviconUrl = og.Favicon

	if images := og.Image; len(images) > 0 {
		i.ImageUrl = images[0].URL
	}
	return i
}
// findContent extracts a short plain-text description from raw HTML.
//
// The document is run through readability, the resulting content is sanitized
// with the bluemonday policy, runs of whitespace are collapsed to single
// spaces, and the text is cut to maxDescriptionSize runes followed by "..."
// when it is longer than that.
//
// It returns an empty string when the document cannot be parsed or when any
// step panics. The result is built in a local variable and only published on
// success, so a panic half-way through can never leak unsanitized markup to
// the caller.
func (l *linkPreview) findContent(data []byte) (content string) {
	defer func() {
		if e := recover(); e != nil {
			// Best effort: HTML parsing may panic on unexpected input. Drop
			// whatever was produced so far rather than return partial output.
			content = ""
		}
	}()

	doc, err := readability.NewDocument(string(data))
	if err != nil {
		return ""
	}

	text := strings.TrimSpace(l.bmPolicy.Sanitize(doc.Content()))
	text = strings.Join(strings.Fields(text), " ") // collapse repeated whitespace
	if utf8.RuneCountInString(text) > maxDescriptionSize {
		// Truncate on rune boundaries so multi-byte characters are not split.
		text = string([]rune(text)[:maxDescriptionSize]) + "..."
	}
	return text
}
// makeNonHtml builds a preview for a response that could not be handled as an
// OpenGraph page, using only the URL and the Content-Type header.
//
// The title is the last path element of url with any query string or fragment
// removed, so "http://host/a.jpg?size=2" yields "a.jpg". The preview type is
// Image for "image/*" content (the URL is then also used as the image URL),
// Text for "text/*", and Unknown for everything else. Media types are compared
// case-insensitively. The returned error is always nil.
func (l *linkPreview) makeNonHtml(url string, resp *http.Response) (i model.ModelLinkPreview, err error) {
	i.Url = url

	// Strip "?query" and "#fragment" so they do not end up in the title.
	name := url
	if cut := strings.IndexAny(name, "?#"); cut >= 0 {
		name = name[:cut]
	}
	i.Title = filepath.Base(name)

	ct := strings.ToLower(strings.TrimSpace(resp.Header.Get("Content-Type")))
	switch {
	case strings.HasPrefix(ct, "image/"):
		i.Type = model.ModelLinkPreview_Image
		i.ImageUrl = url
	case strings.HasPrefix(ct, "text/"):
		i.Type = model.ModelLinkPreview_Text
	default:
		i.Type = model.ModelLinkPreview_Unknown
	}
	return i, nil
}
// proxyRoundTripper wraps an http.RoundTripper and keeps track of the most
// recent response it returned and of the body bytes read from it, so that
// Fetch can inspect them after the request has finished.
type proxyRoundTripper struct {
http.RoundTripper
// lastResponse is the response of the latest RoundTrip call that returned
// without error.
lastResponse *http.Response
// lastBody collects the bytes that limitReader has read from response
// bodies.
lastBody []byte
}
// RoundTrip delegates to the wrapped RoundTripper. On success it remembers the
// response, clears the bytes recorded for any earlier response, and replaces
// the response body with a limitReader that records what is read into
// lastBody.
//
// lastBody is reset for every new response because a single client request
// may go through several round trips (redirects). Without the reset, bytes
// read from intermediate responses would be concatenated with the body of the
// final one.
func (p *proxyRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	resp, err := p.RoundTripper.RoundTrip(req)
	if err != nil {
		return resp, err
	}
	p.lastResponse = resp
	p.lastBody = nil
	resp.Body = &limitReader{ReadCloser: resp.Body, rt: p}
	return resp, nil
}
// limitReader wraps a response body: its Read method copies the bytes it reads
// into rt.lastBody and enforces the maxBytesToRead limit by reporting io.EOF.
type limitReader struct {
// rt is the round tripper whose lastBody receives the bytes read.
rt *proxyRoundTripper
// nTotal is the running count of bytes read through this reader.
nTotal int
io.ReadCloser
}
// Read reads from the wrapped body into p, never handing out more than
// maxBytesToRead bytes in total, and appends everything it reads to
// rt.lastBody.
//
// Once the budget is used up it returns 0, io.EOF without touching the
// underlying reader. The request is shortened to the remaining budget so the
// limit cannot be overshot by a large buffer. Bytes returned together with an
// error are still recorded and counted, as the io.Reader contract asks callers
// to process n > 0 bytes before looking at the error.
func (l *limitReader) Read(p []byte) (n int, err error) {
	remaining := maxBytesToRead - l.nTotal
	if remaining <= 0 {
		return 0, io.EOF
	}
	if len(p) > remaining {
		p = p[:remaining]
	}

	n, err = l.ReadCloser.Read(p)
	if n > 0 {
		l.rt.lastBody = append(l.rt.lastBody, p[:n]...)
		l.nTotal += n
	}
	return n, err
}

View file

@ -0,0 +1,111 @@
package linkpreview
import (
"context"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/anytypeio/go-anytype-library/pb/model"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ctx is the background context passed to every Fetch call in these tests.
var ctx = context.Background()
// TestLinkPreview_Fetch runs Fetch against local httptest servers that answer
// with a fixed Content-Type and body, and checks the resulting preview for an
// HTML page with a description, an HTML page without one, an image, and an
// unrecognized binary content type.
func TestLinkPreview_Fetch(t *testing.T) {
// The page declares its own description, which is expected to be used as is.
t.Run("html page", func(t *testing.T) {
ts := newTestServer("text/html", strings.NewReader(tetsHtml))
defer ts.Close()
lp := New()
info, err := lp.Fetch(ctx, ts.URL)
require.NoError(t, err)
assert.Equal(t, model.ModelLinkPreview{
Url: ts.URL,
FaviconUrl: ts.URL + "/favicon.ico",
Title: "Title",
Description: "Description",
ImageUrl: "http://site.com/images/example.jpg",
Type: model.ModelLinkPreview_Page,
}, info)
})
// The page has no description meta tag, so the expected description is the
// beginning of the body text, cut off and terminated with "...".
t.Run("html page and find description", func(t *testing.T) {
ts := newTestServer("text/html", strings.NewReader(tetsHtmlWithoutDescription))
defer ts.Close()
lp := New()
info, err := lp.Fetch(ctx, ts.URL)
require.NoError(t, err)
assert.Equal(t, model.ModelLinkPreview{
Url: ts.URL,
FaviconUrl: ts.URL + "/favicon.ico",
Title: "Title",
Description: "Sed ut perspiciatis, unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam eaque ipsa, quae ab illo inventore veritatis et quasi architecto beatae vitae dicta...",
ImageUrl: "http://site.com/images/example.jpg",
Type: model.ModelLinkPreview_Page,
}, info)
})
// The server streams an endless body with an image content type; the
// preview is expected to be typed as an image and titled after the file name.
t.Run("binary image", func(t *testing.T) {
tr := testReader(0)
ts := newTestServer("image/jpg", &tr)
defer ts.Close()
url := ts.URL + "/filename.jpg"
lp := New()
info, err := lp.Fetch(ctx, url)
require.NoError(t, err)
assert.Equal(t, model.ModelLinkPreview{
Url: url,
Title: "filename.jpg",
ImageUrl: url,
Type: model.ModelLinkPreview_Image,
}, info)
})
// A content type that is neither image/* nor text/* is expected to yield an
// Unknown preview without an image URL.
t.Run("binary", func(t *testing.T) {
tr := testReader(0)
ts := newTestServer("binary/octed-stream", &tr)
defer ts.Close()
url := ts.URL + "/filename.jpg"
lp := New()
info, err := lp.Fetch(ctx, url)
require.NoError(t, err)
assert.Equal(t, model.ModelLinkPreview{
Url: url,
Title: "filename.jpg",
Type: model.ModelLinkPreview_Unknown,
}, info)
})
}
// newTestServer starts an httptest server that answers every request with the
// given Content-Type header and then streams data as the response body. The
// same reader is shared by all requests, so its content is served only as far
// as it has not been consumed yet. The caller must Close the returned server.
func newTestServer(contentType string, data io.Reader) *httptest.Server {
	serve := func(rw http.ResponseWriter, _ *http.Request) {
		rw.Header().Set("Content-Type", contentType)
		// The copy result is deliberately ignored: the client may hang up
		// before the body has been fully written.
		_, _ = io.Copy(rw, data)
	}
	return httptest.NewServer(http.HandlerFunc(serve))
}
// testReader is an endless io.Reader used to simulate binary payloads. Its
// integer value is the total number of bytes it has reported as read.
type testReader int

// Read claims to have filled p completely without writing to it, adds len(p)
// to the running total, and never returns an error.
func (t *testReader) Read(p []byte) (int, error) {
	filled := len(p)
	*t = *t + testReader(filled)
	return filled, nil
}
// tetsHtml is a minimal HTML fixture with a title, a description meta tag and
// an og:image meta tag.
const tetsHtml = `<html><head>
<title>Title</title>
<meta name="description" content="Description">
<meta property="og:image" content="http://site.com/images/example.jpg" />
</head></html>`
// tetsHtmlWithoutDescription is an HTML fixture with a title and an og:image
// meta tag but no description meta tag; its body holds one long paragraph with
// inline <b> markup, from which the test expects the description to be taken.
const tetsHtmlWithoutDescription = `<html><head>
<title>Title</title>
<meta property="og:image" content="http://site.com/images/example.jpg" />
</head><body><div id="content"">
<p>
Sed ut perspiciatis, unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam eaque ipsa, quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt, explicabo. Nemo enim ipsam voluptatem, quia voluptas sit, aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos, qui ratione voluptatem sequi nesciunt, neque porro quisquam est, qui do<b>lorem ipsum</b>, quia <b>dolor sit, amet, consectetur, adipisci</b> v<b>elit, sed</b> quia non numquam <b>eius mod</b>i <b>tempor</b>a <b>incidunt, ut labore et dolore magna</b>m <b>aliqua</b>m quaerat voluptatem. <b>Ut enim ad minim</b>a <b>veniam, quis nostru</b>m <b>exercitation</b>em <b>ullam co</b>rporis suscipit<b> labori</b>o<b>s</b>am, <b>nisi ut aliquid ex ea commod</b>i <b>consequat</b>ur? <b>Quis aute</b>m vel eum <b>iure reprehenderit,</b> qui <b>in</b> ea <b>voluptate velit esse</b>, quam nihil molestiae <b>c</b>onsequatur, vel <b>illum</b>, qui <b>dolore</b>m <b>eu</b>m <b>fugiat</b>, quo voluptas <b>nulla pariatur</b>? At vero eos et accusamus et iusto odio dignissimos ducimus, qui blanditiis praesentium voluptatum deleniti atque corrupti, quos dolores et quas molestias <b>exceptur</b>i <b>sint, obcaecat</b>i <b>cupiditat</b>e <b>non pro</b>v<b>ident</b>, similique <b>sunt in culpa</b>, <b>qui officia deserunt mollit</b>ia <b>anim</b>i, <b>id est laborum</b> et dolorum fuga. Et harum quidem rerum facilis est et expedita distinctio. Nam libero tempore, cum soluta nobis est eligendi optio, cumque nihil impedit, quo minus id, quod maxime placeat, facere possimus, omnis voluptas assumenda est, omnis dolor repellendus. Temporibus autem quibusdam et aut officiis debitis aut rerum necessitatibus saepe eveniet, ut et voluptates repudiandae sint et molestiae non recusandae. Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat.
</p></div></body></html>`