Mirror of https://github.com/0x2E/fusion.git, synced 2025-06-08 05:27:15 +09:00

Extract feedfinder (#154)

* refactor: extract feedfinder to a standalone package
* fix: increase timeout for feed validation to improve reliability

parent b83b868fc7 · commit 5f527b57a7
13 changed files with 8 additions and 747 deletions
@@ -30,7 +30,7 @@ export type FeedRequestOptions = {
 export async function checkValidity(link: string, options: FeedRequestOptions) {
   const resp = await api
     .post('feeds/validation', {
-      timeout: 10000,
+      timeout: 30000,
       json: { link: link, request_options: options }
     })
     .json<{ feed_links: { title: string; link: string }[] }>();
go.mod (3 changes)
@@ -3,7 +3,7 @@ module github.com/0x2e/fusion
 go 1.24

 require (
-	github.com/PuerkitoBio/goquery v1.10.3
+	github.com/0x2E/feedfinder v0.0.3
 	github.com/caarlos0/env/v11 v11.3.1
 	github.com/glebarez/sqlite v1.11.0
 	github.com/go-playground/locales v0.14.1

@@ -21,6 +21,7 @@ require (
 )

 require (
+	github.com/PuerkitoBio/goquery v1.10.3 // indirect
 	github.com/andybalholm/cascadia v1.3.3 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
go.sum (2 changes)
@@ -1,3 +1,5 @@
+github.com/0x2E/feedfinder v0.0.3 h1:B5jXpsQPelV+YcCigqAm7AXlFYml3X8A+5ug8FJ02/4=
+github.com/0x2E/feedfinder v0.0.3/go.mod h1:/+tl3hTDZetFvRLsphmRqn3081E8nVtGhp28mUOZtOk=
 github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
 github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
 github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
@@ -1,132 +0,0 @@ (file deleted)
package feedfinder

import (
    "context"
    "fmt"
    "log/slog"
    "net/http"
    "net/url"
    "sync"
)

type FeedLink struct {
    Title string `json:"title"`
    Link  string `json:"link"`
}

type Finder struct {
    target     *url.URL
    httpClient *http.Client
}

type Options struct {
    ReqProxy *string
}

func Find(ctx context.Context, target *url.URL, options Options) ([]FeedLink, error) {
    clientTransportOps := []transportOptionFunc{}
    if options.ReqProxy != nil && *options.ReqProxy != "" {
        proxyURL, err := url.Parse(*options.ReqProxy)
        if err != nil {
            return nil, err
        }
        clientTransportOps = append(clientTransportOps, func(transport *http.Transport) {
            transport.Proxy = http.ProxyURL(proxyURL)
        })
    }

    finder := Finder{
        target:     target,
        httpClient: newClient(clientTransportOps...),
    }
    return finder.Run(context.Background())
}

func (f *Finder) Run(ctx context.Context) ([]FeedLink, error) {
    // find in third-party service
    logger := slog.With("step", "third-party service")
    fromService, err := f.tryService(ctx)
    if err != nil {
        logger.Error(err.Error())
    }
    if len(fromService) != 0 {
        return fromService, nil
    }

    feedMap := make(map[string]FeedLink)
    mu := sync.Mutex{}
    wg := sync.WaitGroup{}

    wg.Add(1)
    go func() {
        defer wg.Done()

        // sniff in HTML
        logger := slog.With("step", "page")
        data, err := f.tryPageSource(ctx)
        if err != nil {
            logger.Error(err.Error())
        }

        mu.Lock()
        for _, f := range data {
            feedMap[f.Link] = f
        }
        mu.Unlock()
    }()

    wg.Add(1)
    go func() {
        defer wg.Done()

        // sniff well-knowns under this url
        logger := logger.With("step", "well-knowns")
        data, err := f.tryWellKnown(ctx, fmt.Sprintf("%s://%s%s", f.target.Scheme, f.target.Host, f.target.Path))
        if err != nil {
            logger.Error(err.Error())
        }
        if len(data) == 0 {
            // sniff well-knowns under root path
            data, err = f.tryWellKnown(ctx, fmt.Sprintf("%s://%s", f.target.Scheme, f.target.Host))
            if err != nil {
                logger.Error(err.Error())
            }
        }

        mu.Lock()
        for _, f := range data {
            feedMap[f.Link] = f
        }
        mu.Unlock()
    }()

    wg.Wait()
    res := make([]FeedLink, 0, len(feedMap))
    for _, f := range feedMap {
        res = append(res, f)
    }
    return res, nil
}

func isEmptyFeedLink(feed FeedLink) bool {
    return feed == FeedLink{}
}

func formatLinkToAbs(base, link string) string {
    if link == "" {
        return base
    }
    linkURL, err := url.Parse(link)
    if err != nil {
        return link
    }
    if linkURL.IsAbs() {
        return link
    }

    baseURL, err := url.Parse(base)
    if err != nil {
        return link
    }
    return baseURL.ResolveReference(linkURL).String()
}
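Note: the deleted Run above fans two sniffers out as goroutines and merges their results into a map keyed by feed link, so duplicate discoveries collapse; it also accepts a ctx parameter in Find but then hands context.Background() to Run. A minimal, self-contained sketch of the same merge pattern (all names here are illustrative, not from the package):

package main

import (
    "fmt"
    "sync"
)

// dedupeConcurrent runs each producer in its own goroutine and merges
// the returned links into a set guarded by a mutex, mirroring the
// map/mutex/WaitGroup shape of the deleted Run.
func dedupeConcurrent(producers ...func() []string) []string {
    seen := make(map[string]struct{})
    var (
        mu sync.Mutex
        wg sync.WaitGroup
    )
    for _, p := range producers {
        wg.Add(1)
        go func(produce func() []string) {
            defer wg.Done()
            for _, link := range produce() {
                mu.Lock()
                seen[link] = struct{}{}
                mu.Unlock()
            }
        }(p)
    }
    wg.Wait()

    out := make([]string, 0, len(seen))
    for link := range seen {
        out = append(out, link)
    }
    return out
}

func main() {
    page := func() []string { return []string{"https://x.xx/rss.xml", "https://x.xx/atom.xml"} }
    wellKnown := func() []string { return []string{"https://x.xx/rss.xml"} } // overlaps with page
    fmt.Println(len(dedupeConcurrent(page, wellKnown))) // 2: the overlap is deduplicated
}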
@@ -1,27 +0,0 @@ (file deleted)
package feedfinder

import (
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestFormatLinkToAbs(t *testing.T) {
    type item struct {
        base string
        link string
        want string
    }
    table := []item{
        {base: "https://x.xx", link: "https://1.xx", want: "https://1.xx"},
        {base: "https://x.xx", link: "", want: "https://x.xx"},
        {base: "https://x.xx/1/", link: "/x/index.xml", want: "https://x.xx/x/index.xml"},
        {base: "https://x.xx/1/", link: "x/index.xml", want: "https://x.xx/1/x/index.xml"},
        {base: "https://x.xx/1", link: "index.xml", want: "https://x.xx/index.xml"},
    }

    for _, tt := range table {
        res := formatLinkToAbs(tt.base, tt.link)
        assert.Equal(t, tt.want, res)
    }
}
@@ -1,107 +0,0 @@ (file deleted)
package feedfinder

import (
    "bytes"
    "context"
    "fmt"
    "io"
    "log/slog"
    "net/http"

    "github.com/PuerkitoBio/goquery"
)

func (f *Finder) tryPageSource(ctx context.Context) ([]FeedLink, error) {
    resp, err := f.httpClient.Get(f.target.String())
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    content, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("bad status %d", resp.StatusCode)
    }

    feeds, err := f.parseHTMLContent(ctx, content)
    if err != nil {
        slog.Error(err.Error(), "content_type", "HTML")
    }
    if len(feeds) != 0 {
        for i := range feeds {
            feed := &feeds[i]
            feed.Link = formatLinkToAbs(f.target.String(), feed.Link)
        }
        return feeds, nil
    }

    feed, err := parseRSSContent(content)
    if err != nil {
        slog.Error(err.Error(), "content_type", "RSS")
    }
    if !isEmptyFeedLink(feed) {
        if feed.Link == "" {
            feed.Link = f.target.String()
        }
        return []FeedLink{feed}, nil
    }

    return nil, nil
}

func (f *Finder) parseHTMLContent(ctx context.Context, content []byte) ([]FeedLink, error) {
    feeds := make([]FeedLink, 0)

    doc, err := goquery.NewDocumentFromReader(bytes.NewReader(content))
    if err != nil {
        return nil, err
    }

    pageTitle := doc.FindMatcher(goquery.Single("title")).Text()

    // find <link> type rss in <header>
    linkExprs := []string{
        "link[type='application/rss+xml']",
        "link[type='application/atom+xml']",
        "link[type='application/json']",
        "link[type='application/feed+json']",
    }
    for _, expr := range linkExprs {
        doc.Find("head").Find(expr).Each(func(_ int, s *goquery.Selection) {
            feed := FeedLink{}
            feed.Title, _ = s.Attr("title")
            feed.Link, _ = s.Attr("href")

            if feed.Title == "" {
                feed.Title = pageTitle
            }
            feeds = append(feeds, feed)
        })
    }

    // find <a> type rss in <body>
    aExpr := "a:contains('rss')"
    suspected := make(map[string]struct{})
    doc.Find("body").Find(aExpr).Each(func(_ int, s *goquery.Selection) {
        link, exists := s.Attr("href")
        if !exists {
            return
        }
        suspected[link] = struct{}{}
    })
    for link := range suspected {
        feed, err := f.parseRSSUrl(ctx, link)
        if err != nil {
            continue
        }
        if !isEmptyFeedLink(feed) {
            feed.Link = link // this may be more accurate than the link parsed from the rss content
            feeds = append(feeds, feed)
        }
    }

    return feeds, nil
}
@@ -1,70 +0,0 @@ (file deleted)
package feedfinder

import (
    "context"
    "testing"

    "github.com/stretchr/testify/assert"
)

type testParseHTMLContentItem struct {
    content []byte
    want    []FeedLink
}

func TestParseHTMLContentMatchLink(t *testing.T) {
    table := []testParseHTMLContentItem{
        {content: []byte(`
<html>
<head>
    <title>html title</title>
    <link type="application/rss+xml" title="feed title" href="https://example.com/x/rss.xml">
    <link type="application/atom+xml" href="https://example.com/x/atom.xml">
</head>
<body>
    <link type="application/feed+json" title="link in body" href="https://example.com/x/feed.json">
</body>
</html>
`), want: []FeedLink{
            {Title: "feed title", Link: "https://example.com/x/rss.xml"},
            {Title: "html title", Link: "https://example.com/x/atom.xml"},
        }},
    }

    for _, tt := range table {
        finder := Finder{}
        feed, err := finder.parseHTMLContent(context.Background(), tt.content)
        assert.Nil(t, err)
        assert.ElementsMatch(t, tt.want, feed)
    }
}

func TestParseHTMLContentMatchLinkElement(t *testing.T) {
    table := []testParseHTMLContentItem{
        // match <a>
        {content: []byte(`
<html>
<head><title>html title</title></head>
<body>
    <p>xxx</p>
    <main>
        <p>xxx</p>
        <a href="https://github.com/golang/go/releases.atom">RSS: Release notes from go</a>
    </main>
    <footer>
        <a href="https://github.com/golang/go">wrong rss</a>
    </footer>
</body>
</html>
`), want: []FeedLink{
            {Title: "Release notes from go", Link: "https://github.com/golang/go/releases.atom"},
        }},
    }

    for _, tt := range table {
        finder := Finder{httpClient: newClient()}
        feed, err := finder.parseHTMLContent(context.Background(), tt.content)
        assert.Nil(t, err)
        assert.ElementsMatch(t, tt.want, feed)
    }
}
@@ -1,23 +0,0 @@ (file deleted)
package feedfinder

import (
    "net/http"
    "time"
)

type transportOptionFunc func(transport *http.Transport)

func newClient(options ...transportOptionFunc) *http.Client {
    transport := http.DefaultTransport.(*http.Transport).Clone()
    transport.DisableKeepAlives = true
    transport.ForceAttemptHTTP2 = true

    for _, optionFunc := range options {
        optionFunc(transport)
    }

    return &http.Client{
        Transport: transport,
        Timeout:   1 * time.Minute,
    }
}
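Note: newClient clones http.DefaultTransport and lets callers mutate the clone through functional options, which is how the deleted Find injects a proxy. A small usage sketch (the helper is repeated so the snippet runs standalone; the proxy address is illustrative):

package main

import (
    "fmt"
    "net/http"
    "net/url"
    "time"
)

type transportOptionFunc func(transport *http.Transport)

// newClient mirrors the deleted helper above: clone the default
// transport, apply caller-supplied options, wrap it in a client.
func newClient(options ...transportOptionFunc) *http.Client {
    transport := http.DefaultTransport.(*http.Transport).Clone()
    transport.DisableKeepAlives = true
    transport.ForceAttemptHTTP2 = true
    for _, optionFunc := range options {
        optionFunc(transport)
    }
    return &http.Client{Transport: transport, Timeout: 1 * time.Minute}
}

func main() {
    proxyURL, _ := url.Parse("http://127.0.0.1:8080") // illustrative proxy
    client := newClient(func(t *http.Transport) {
        t.Proxy = http.ProxyURL(proxyURL)
    })
    fmt.Println(client.Timeout) // 1m0s
}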
@@ -1,198 +0,0 @@ (file deleted)
package feedfinder

import (
    "context"
    "fmt"
    "io"
    "regexp"
    "strings"
)

type serviceMatcher func(context.Context) ([]FeedLink, error)

func (f *Finder) tryService(ctx context.Context) ([]FeedLink, error) {
    matcher := []serviceMatcher{
        f.githubMatcher,
        f.redditMatcher,
        f.youtubeMatcher,
    }
    for _, fn := range matcher {
        feed, err := fn(ctx)
        if err != nil {
            continue
        }
        if len(feed) != 0 {
            return feed, nil
        }
    }
    return nil, nil
}

var githubGlobalFeed = []FeedLink{
    {Title: "global public timeline", Link: "https://github.com/timeline"},
    {Title: "global security advisories", Link: "https://github.com/security-advisories.atom"},
}

// https://docs.github.com/en/rest/activity/feeds?apiVersion=2022-11-28#get-feeds
func (s Finder) githubMatcher(ctx context.Context) ([]FeedLink, error) {
    if !strings.HasSuffix(s.target.Hostname(), "github.com") {
        return nil, nil
    }

    splited := strings.SplitN(s.target.Path, "/", 4) // split "/user/repo/" -> []string{"", "user", "repo", ""}
    splitedLen := len(splited)
    if splitedLen < 2 {
        return githubGlobalFeed, nil
    }

    username, reponame := "", ""
    if splitedLen >= 2 {
        username = splited[1]
    }
    if splitedLen >= 3 {
        reponame = splited[2]
    }

    if reponame != "" {
        re, err := regexp.Compile(`^[a-zA-Z0-9][a-zA-Z0-9-_\.]{0,98}[a-zA-Z0-9]$`) // todo need improve
        if err != nil {
            return nil, err
        }
        if !re.MatchString(reponame) {
            return nil, nil
        }
        return genGitHubRepoFeed(username + "/" + reponame), nil
    }

    if username != "" {
        re, err := regexp.Compile(`^[a-zA-Z0-9][-]?[a-zA-Z0-9]{0,38}$`)
        if err != nil {
            return nil, err
        }
        if !re.MatchString(username) {
            return nil, nil
        }
        return genGitHubUserFeed(username), nil
    }

    return nil, nil
}

func genGitHubUserFeed(username string) []FeedLink {
    return []FeedLink{{Title: username + " public timeline", Link: fmt.Sprintf("https://github.com/%s.atom", username)}}
}

func genGitHubRepoFeed(userRepo string) []FeedLink {
    return []FeedLink{
        {Title: fmt.Sprintf("%s commits", userRepo), Link: fmt.Sprintf("https://github.com/%s/commits.atom", userRepo)},
        {Title: fmt.Sprintf("%s releases", userRepo), Link: fmt.Sprintf("https://github.com/%s/releases.atom", userRepo)},
        {Title: fmt.Sprintf("%s tags", userRepo), Link: fmt.Sprintf("https://github.com/%s/tags.atom", userRepo)},
        {Title: fmt.Sprintf("%s wiki", userRepo), Link: fmt.Sprintf("https://github.com/%s/wiki.atom", userRepo)},
    }
}

var redditGlobalFeed = []FeedLink{
    {Title: "global", Link: "https://www.reddit.com/.rss"},
}

// https://www.reddit.com/wiki/rss/
func (s Finder) redditMatcher(ctx context.Context) ([]FeedLink, error) {
    if !strings.HasSuffix(s.target.Hostname(), "reddit.com") {
        return nil, nil
    }

    splited := strings.SplitN(s.target.Path, "/", 4)
    splitedLen := len(splited)
    if splitedLen < 2 {
        return redditGlobalFeed, nil
    }

    mode, param := splited[1], splited[2]
    switch mode {
    case "r":
        if splitedLen >= 4 && strings.HasPrefix(splited[3], "comments") {
            // "comments/{postID}/{title}"
            // "comments/{postID}/{title}/comment/{commentID}"
            return genRedditCommentFeed(s.target.String()), nil
        }
        return genRedditSubFeed(param), nil
    case "user":
        return genRedditUserFeed(param), nil
    case "domain":
        return genRedditDomainSubmissionFeed(param), nil
    default:
    }

    return nil, nil
}

func genRedditSubFeed(sub string) []FeedLink {
    return []FeedLink{
        {Title: fmt.Sprintf("/r/%s hot", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/hot/.rss", sub)},
        {Title: fmt.Sprintf("/r/%s new", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/new/.rss", sub)},
        {Title: fmt.Sprintf("/r/%s top", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/top/.rss", sub)},
        {Title: fmt.Sprintf("/r/%s rising", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/rising/.rss", sub)},
    }
}

func genRedditCommentFeed(fullURL string) []FeedLink {
    return []FeedLink{{Title: "post", Link: fullURL + ".rss"}}
}

func genRedditUserFeed(username string) []FeedLink {
    return []FeedLink{
        {Title: fmt.Sprintf("/u/%s overview new", username), Link: fmt.Sprintf("https://reddit.com/user/%s/.rss?sort=new", username)},
        {Title: fmt.Sprintf("/u/%s overview hot", username), Link: fmt.Sprintf("https://reddit.com/user/%s/.rss?sort=hot", username)},
        {Title: fmt.Sprintf("/u/%s overview top", username), Link: fmt.Sprintf("https://reddit.com/user/%s/.rss?sort=top", username)},
        {Title: fmt.Sprintf("/u/%s post new", username), Link: fmt.Sprintf("https://reddit.com/user/%s/submitted/.rss?sort=new", username)},
        {Title: fmt.Sprintf("/u/%s post hot", username), Link: fmt.Sprintf("https://reddit.com/user/%s/submitted/.rss?sort=hot", username)},
        {Title: fmt.Sprintf("/u/%s post top", username), Link: fmt.Sprintf("https://reddit.com/user/%s/submitted/.rss?sort=top", username)},
        {Title: fmt.Sprintf("/u/%s comments new", username), Link: fmt.Sprintf("https://reddit.com/user/%s/comments/.rss?sort=new", username)},
        {Title: fmt.Sprintf("/u/%s comments hot", username), Link: fmt.Sprintf("https://reddit.com/user/%s/comments/.rss?sort=hot", username)},
        {Title: fmt.Sprintf("/u/%s comments top", username), Link: fmt.Sprintf("https://reddit.com/user/%s/comments/.rss?sort=top", username)},
        {Title: fmt.Sprintf("/u/%s awards received (legacy)", username), Link: fmt.Sprintf("https://old.reddit.com/user/%s/gilded/.rss", username)},
    }
}

func genRedditDomainSubmissionFeed(domain string) []FeedLink {
    return []FeedLink{{Title: "/domain/" + domain, Link: fmt.Sprintf("https://reddit.com/domain/%s/.rss", domain)}}
}

func (s Finder) youtubeMatcher(ctx context.Context) ([]FeedLink, error) {
    if !strings.HasSuffix(s.target.Hostname(), "youtube.com") && !strings.HasSuffix(s.target.Hostname(), "youtu.be") {
        return nil, nil
    }

    resp, err := s.httpClient.Get(s.target.String())
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    content, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }
    if strings.HasPrefix(s.target.Path, "/@") {
        re, err := regexp.Compile(`{"key":"browse_id","value":"(.+?)"}`)
        if err != nil {
            return nil, err
        }
        match := re.FindStringSubmatch(string(content))
        if len(match) < 2 {
            return nil, nil
        }
        id := match[1]
        if id == "" {
            return nil, nil
        }
        return []FeedLink{{Title: "Channel", Link: "https://www.youtube.com/feeds/videos.xml?channel_id=" + id}}, nil
    } else if strings.HasPrefix(s.target.Path, "/playlist") {
        id := s.target.Query().Get("list")
        if id == "" {
            return nil, nil
        }
        return []FeedLink{{Title: "Playlist", Link: "https://www.youtube.com/feeds/videos.xml?playlist_id=" + id}}, nil
    }

    return nil, nil
}
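Note: both matchers carve the URL path with strings.SplitN(path, "/", 4), so everything past the third slash stays in one trailing piece. A quick demonstration of the shapes it returns; note also that redditMatcher reads splited[2] before checking for a third element, so a two-element split (e.g. the path "/") appears to index out of range:

package main

import (
    "fmt"
    "strings"
)

func main() {
    // The limit of 4 caps the number of pieces, keeping the remainder intact.
    fmt.Printf("%q\n", strings.SplitN("/user/repo/", "/", 4))
    // ["" "user" "repo" ""]
    fmt.Printf("%q\n", strings.SplitN("/r/homelab/comments/1234/title", "/", 4))
    // ["" "r" "homelab" "comments/1234/title"]
    fmt.Printf("%q\n", strings.SplitN("/", "/", 4))
    // ["" ""] -- length 2, so splited[2] would panic
}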
@@ -1,71 +0,0 @@ (file deleted)
package feedfinder

import (
    "context"
    "net/url"
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestGitHub(t *testing.T) {
    type testItem struct {
        url  *url.URL
        want []FeedLink
    }

    urlBase, _ := url.Parse("https://github.com?xxx=1")
    urlUser, _ := url.Parse("https://github.com/user")
    urlUserFeed := genGitHubUserFeed("user")
    urlUserRepo, _ := url.Parse("https://github.com/user/repo")
    urlRepoFeed := genGitHubRepoFeed("user/repo")

    table := []testItem{
        {url: urlBase, want: githubGlobalFeed},
        {url: urlUser, want: urlUserFeed},
        {url: urlUserRepo, want: urlRepoFeed},
    }

    for _, tt := range table {
        finder := Finder{
            target: tt.url,
        }
        feed, err := finder.githubMatcher(context.Background())
        assert.Nil(t, err)
        assert.ElementsMatch(t, tt.want, feed)
    }
}

func TestReddit(t *testing.T) {
    type testItem struct {
        url  *url.URL
        want []FeedLink
    }

    urlBase, _ := url.Parse("https://www.reddit.com")
    urlSub, _ := url.Parse("https://www.reddit.com/r/homelab")
    urlSubFeed := genRedditSubFeed("homelab")
    urlComment, _ := url.Parse("https://www.reddit.com/r/homelab/comments/1234/xxx_xxx_xx/")
    urlCommentFeed := genRedditCommentFeed("https://www.reddit.com/r/homelab/comments/1234/xxx_xxx_xx/")
    urlUser, _ := url.Parse("https://www.reddit.com/user/x")
    urlUserFeed := genRedditUserFeed("x")
    urlDomainSubmission, _ := url.Parse("https://www.reddit.com/domain/github.com/")
    urlDomainSubmissionFeed := genRedditDomainSubmissionFeed("github.com")

    table := []testItem{
        {url: urlBase, want: redditGlobalFeed},
        {url: urlSub, want: urlSubFeed},
        {url: urlComment, want: urlCommentFeed},
        {url: urlUser, want: urlUserFeed},
        {url: urlDomainSubmission, want: urlDomainSubmissionFeed},
    }

    for _, tt := range table {
        finder := Finder{
            target: tt.url,
        }
        feed, err := finder.redditMatcher(context.Background())
        assert.Nil(t, err)
        assert.ElementsMatch(t, tt.want, feed)
    }
}
@@ -1,72 +0,0 @@ (file deleted)
package feedfinder

import (
    "bytes"
    "context"
    "io"
    "net/url"

    "github.com/mmcdole/gofeed"
)

func (f *Finder) tryWellKnown(ctx context.Context, baseURL string) ([]FeedLink, error) {
    wellKnown := []string{
        "atom.xml",
        "feed.xml",
        "rss.xml",
        "index.xml",
        "atom.json",
        "feed.json",
        "rss.json",
        "index.json",
        "feed/",
        "rss/",
    }
    feeds := make([]FeedLink, 0)

    for _, suffix := range wellKnown {
        newTarget, err := url.JoinPath(baseURL, suffix)
        if err != nil {
            continue
        }
        feed, err := f.parseRSSUrl(ctx, newTarget)
        if err != nil {
            continue
        }
        if !isEmptyFeedLink(feed) {
            feed.Link = newTarget // this may be more accurate than the link parsed from the rss content
            feeds = append(feeds, feed)
        }
    }

    return feeds, nil
}

func (f *Finder) parseRSSUrl(ctx context.Context, target string) (FeedLink, error) {
    resp, err := f.httpClient.Get(target)
    if err != nil {
        return FeedLink{}, err
    }
    defer resp.Body.Close()

    content, err := io.ReadAll(resp.Body)
    if err != nil {
        return FeedLink{}, err
    }
    return parseRSSContent(content)
}

func parseRSSContent(content []byte) (FeedLink, error) {
    parsed, err := gofeed.NewParser().Parse(bytes.NewReader(content))
    if err != nil || parsed == nil {
        return FeedLink{}, err
    }
    return FeedLink{
        // https://github.com/mmcdole/gofeed#default-mappings
        Title: parsed.Title,

        // set as default value, but the value parsed from rss are not always accurate.
        // it is better to use the url that gets the rss content.
        Link: parsed.FeedLink,
    }, nil
}
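Note: tryWellKnown builds each candidate URL with url.JoinPath, which joins and cleans the path so a trailing slash on the base does not double up. A tiny sketch of the URLs it would probe, for an illustrative base:

package main

import (
    "fmt"
    "net/url"
)

func main() {
    // A few of the suffixes from the deleted tryWellKnown above.
    wellKnown := []string{"atom.xml", "feed.xml", "rss/"}
    for _, suffix := range wellKnown {
        target, err := url.JoinPath("https://example.com/blog/", suffix)
        if err != nil {
            continue
        }
        fmt.Println(target)
    }
    // Output:
    // https://example.com/blog/atom.xml
    // https://example.com/blog/feed.xml
    // https://example.com/blog/rss/
}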
@@ -1,42 +0,0 @@ (file deleted)
package feedfinder

import (
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestParseRSSContent(t *testing.T) {
    type testItem struct {
        content []byte
        want    FeedLink
    }

    // TODO: match all types, e.g. https://github.com/mmcdole/gofeed/tree/master/testdata
    table := []testItem{
        {content: []byte(`
<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
    <channel>
        <title>test</title>
        <link>https://example.com/</link>
        <language>en</language>
        <lastBuildDate>Fri, 24 Feb 2023 00:43:57 +0800</lastBuildDate>
        <atom:link href="https://example.com/feed.xml" rel="self" type="application/rss+xml"/>
        <item>
            <title>post1</title>
            <link>https://example.com/post1/</link>
            <pubDate>Fri, 24 Feb 2023 00:43:57 +0800</pubDate>
            <guid>https://example.com/post1/</guid>
        </item>
    </channel>
</rss>
`), want: FeedLink{Title: "test", Link: "https://example.com/feed.xml"}},
    }

    for _, tt := range table {
        feed, err := parseRSSContent(tt.content)
        assert.Nil(t, err)
        assert.Equal(t, tt.want, feed)
    }
}
@@ -7,8 +7,8 @@ import (
 	"net/url"
 	"sync"

+	"github.com/0x2E/feedfinder"
 	"github.com/0x2e/fusion/model"
-	"github.com/0x2e/fusion/pkg/feedfinder"
 	"github.com/0x2e/fusion/repo"
 	"github.com/0x2e/fusion/service/pull"
 	"github.com/0x2e/fusion/service/pull/client"

@@ -140,8 +140,8 @@ func (f Feed) CheckValidity(ctx context.Context, req *ReqFeedCheckValidity) (*Re
 	if err != nil {
 		return nil, err
 	}
-	sniffed, err := feedfinder.Find(ctx, target, feedfinder.Options{
-		ReqProxy: req.RequestOptions.Proxy,
+	sniffed, err := feedfinder.Find(ctx, target.String(), &feedfinder.Options{
+		RequestProxy: req.RequestOptions.Proxy,
 	})
 	if err != nil {
 		return nil, err
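Note: after the extraction, the call site passes a string target and a *feedfinder.Options with a RequestProxy field, per the hunk above. A hedged sketch of standalone usage: the signature is inferred from this diff alone, and the result elements are assumed to keep the Title/Link shape of the deleted internal FeedLink.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/0x2E/feedfinder"
)

func main() {
    // Inferred from the call-site change: Find(ctx, target string, opts *Options).
    feeds, err := feedfinder.Find(context.Background(), "https://go.dev/blog", &feedfinder.Options{
        RequestProxy: nil, // optional proxy, wired from request options in fusion
    })
    if err != nil {
        log.Fatal(err)
    }
    for _, f := range feeds {
        fmt.Println(f.Title, f.Link) // assumed field names
    }
}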