forked from 0x2E/fusion
Extract feedfinder (#154)
* refactor: extract feedfinder to a standalone package * fix: increase timeout for feed validation to improve reliability
This commit is contained in:
parent
b83b868fc7
commit
5f527b57a7
13 changed files with 8 additions and 747 deletions
|
@ -30,7 +30,7 @@ export type FeedRequestOptions = {
|
|||
export async function checkValidity(link: string, options: FeedRequestOptions) {
|
||||
const resp = await api
|
||||
.post('feeds/validation', {
|
||||
timeout: 10000,
|
||||
timeout: 30000,
|
||||
json: { link: link, request_options: options }
|
||||
})
|
||||
.json<{ feed_links: { title: string; link: string }[] }>();
|
||||
|
|
3
go.mod
3
go.mod
|
@ -3,7 +3,7 @@ module github.com/0x2e/fusion
|
|||
go 1.24
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.10.3
|
||||
github.com/0x2E/feedfinder v0.0.3
|
||||
github.com/caarlos0/env/v11 v11.3.1
|
||||
github.com/glebarez/sqlite v1.11.0
|
||||
github.com/go-playground/locales v0.14.1
|
||||
|
@ -21,6 +21,7 @@ require (
|
|||
)
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.10.3 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
|
|
2
go.sum
2
go.sum
|
@ -1,3 +1,5 @@
|
|||
github.com/0x2E/feedfinder v0.0.3 h1:B5jXpsQPelV+YcCigqAm7AXlFYml3X8A+5ug8FJ02/4=
|
||||
github.com/0x2E/feedfinder v0.0.3/go.mod h1:/+tl3hTDZetFvRLsphmRqn3081E8nVtGhp28mUOZtOk=
|
||||
github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
|
||||
github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
|
|
|
@ -1,132 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// FeedLink describes a single discovered feed.
type FeedLink struct {
	// Title is the display name of the feed.
	Title string `json:"title"`
	// Link is the URL of the feed.
	Link string `json:"link"`
}
|
||||
|
||||
// Finder carries the state shared by all discovery steps for one target.
type Finder struct {
	// target is the URL whose feeds are being searched for.
	target *url.URL
	// httpClient performs every outbound request made by the finder.
	httpClient *http.Client
}
|
||||
|
||||
// Options tunes how Find performs its requests.
type Options struct {
	// ReqProxy is an optional proxy URL; nil or an empty string means that
	// Find does not configure a proxy.
	ReqProxy *string
}
|
||||
|
||||
func Find(ctx context.Context, target *url.URL, options Options) ([]FeedLink, error) {
|
||||
clientTransportOps := []transportOptionFunc{}
|
||||
if options.ReqProxy != nil && *options.ReqProxy != "" {
|
||||
proxyURL, err := url.Parse(*options.ReqProxy)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
clientTransportOps = append(clientTransportOps, func(transport *http.Transport) {
|
||||
transport.Proxy = http.ProxyURL(proxyURL)
|
||||
})
|
||||
}
|
||||
|
||||
finder := Finder{
|
||||
target: target,
|
||||
httpClient: newClient(clientTransportOps...),
|
||||
}
|
||||
return finder.Run(context.Background())
|
||||
}
|
||||
|
||||
func (f *Finder) Run(ctx context.Context) ([]FeedLink, error) {
|
||||
// find in third-party service
|
||||
logger := slog.With("step", "third-party service")
|
||||
fromService, err := f.tryService(ctx)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
if len(fromService) != 0 {
|
||||
return fromService, nil
|
||||
}
|
||||
|
||||
feedMap := make(map[string]FeedLink)
|
||||
mu := sync.Mutex{}
|
||||
wg := sync.WaitGroup{}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
// sniff in HTML
|
||||
logger := slog.With("step", "page")
|
||||
data, err := f.tryPageSource(ctx)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
for _, f := range data {
|
||||
feedMap[f.Link] = f
|
||||
}
|
||||
mu.Unlock()
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
// sniff well-knowns under this url
|
||||
logger := logger.With("step", "well-knowns")
|
||||
data, err := f.tryWellKnown(ctx, fmt.Sprintf("%s://%s%s", f.target.Scheme, f.target.Host, f.target.Path))
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
if len(data) == 0 {
|
||||
// sniff well-knowns under root path
|
||||
data, err = f.tryWellKnown(ctx, fmt.Sprintf("%s://%s", f.target.Scheme, f.target.Host))
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
for _, f := range data {
|
||||
feedMap[f.Link] = f
|
||||
}
|
||||
mu.Unlock()
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
res := make([]FeedLink, 0, len(feedMap))
|
||||
for _, f := range feedMap {
|
||||
res = append(res, f)
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func isEmptyFeedLink(feed FeedLink) bool {
|
||||
return feed == FeedLink{}
|
||||
}
|
||||
|
||||
// formatLinkToAbs turns link into an absolute URL using base as the reference.
//
// An empty link yields base. A link that is already absolute, or that cannot
// be parsed, is returned unchanged; the same happens when base cannot be
// parsed. Otherwise link is resolved against base.
func formatLinkToAbs(base, link string) string {
	if link == "" {
		return base
	}

	ref, err := url.Parse(link)
	if err != nil || ref.IsAbs() {
		return link
	}

	root, err := url.Parse(base)
	if err != nil {
		return link
	}
	resolved := root.ResolveReference(ref)
	return resolved.String()
}
|
|
@ -1,27 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestFormatLinkToAbs(t *testing.T) {
|
||||
type item struct {
|
||||
base string
|
||||
link string
|
||||
want string
|
||||
}
|
||||
table := []item{
|
||||
{base: "https://x.xx", link: "https://1.xx", want: "https://1.xx"},
|
||||
{base: "https://x.xx", link: "", want: "https://x.xx"},
|
||||
{base: "https://x.xx/1/", link: "/x/index.xml", want: "https://x.xx/x/index.xml"},
|
||||
{base: "https://x.xx/1/", link: "x/index.xml", want: "https://x.xx/1/x/index.xml"},
|
||||
{base: "https://x.xx/1", link: "index.xml", want: "https://x.xx/index.xml"},
|
||||
}
|
||||
|
||||
for _, tt := range table {
|
||||
res := formatLinkToAbs(tt.base, tt.link)
|
||||
assert.Equal(t, tt.want, res)
|
||||
}
|
||||
}
|
|
@ -1,107 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
func (f *Finder) tryPageSource(ctx context.Context) ([]FeedLink, error) {
|
||||
resp, err := f.httpClient.Get(f.target.String())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
content, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("bad status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
feeds, err := f.parseHTMLContent(ctx, content)
|
||||
if err != nil {
|
||||
slog.Error(err.Error(), "content_type", "HTML")
|
||||
}
|
||||
if len(feeds) != 0 {
|
||||
for i := range feeds {
|
||||
feed := &feeds[i]
|
||||
feed.Link = formatLinkToAbs(f.target.String(), feed.Link)
|
||||
}
|
||||
return feeds, nil
|
||||
}
|
||||
|
||||
feed, err := parseRSSContent(content)
|
||||
if err != nil {
|
||||
slog.Error(err.Error(), "content_type", "RSS")
|
||||
}
|
||||
if !isEmptyFeedLink(feed) {
|
||||
if feed.Link == "" {
|
||||
feed.Link = f.target.String()
|
||||
}
|
||||
return []FeedLink{feed}, nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f *Finder) parseHTMLContent(ctx context.Context, content []byte) ([]FeedLink, error) {
|
||||
feeds := make([]FeedLink, 0)
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(content))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pageTitle := doc.FindMatcher(goquery.Single("title")).Text()
|
||||
|
||||
// find <link> type rss in <header>
|
||||
linkExprs := []string{
|
||||
"link[type='application/rss+xml']",
|
||||
"link[type='application/atom+xml']",
|
||||
"link[type='application/json']",
|
||||
"link[type='application/feed+json']",
|
||||
}
|
||||
for _, expr := range linkExprs {
|
||||
doc.Find("head").Find(expr).Each(func(_ int, s *goquery.Selection) {
|
||||
feed := FeedLink{}
|
||||
feed.Title, _ = s.Attr("title")
|
||||
feed.Link, _ = s.Attr("href")
|
||||
|
||||
if feed.Title == "" {
|
||||
feed.Title = pageTitle
|
||||
}
|
||||
feeds = append(feeds, feed)
|
||||
})
|
||||
}
|
||||
|
||||
// find <a> type rss in <body>
|
||||
aExpr := "a:contains('rss')"
|
||||
suspected := make(map[string]struct{})
|
||||
doc.Find("body").Find(aExpr).Each(func(_ int, s *goquery.Selection) {
|
||||
link, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
suspected[link] = struct{}{}
|
||||
})
|
||||
for link := range suspected {
|
||||
feed, err := f.parseRSSUrl(ctx, link)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if !isEmptyFeedLink(feed) {
|
||||
feed.Link = link // this may be more accurate than the link parsed from the rss content
|
||||
feeds = append(feeds, feed)
|
||||
}
|
||||
}
|
||||
|
||||
return feeds, nil
|
||||
}
|
|
@ -1,70 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
type testParseHTMLContentItem struct {
|
||||
content []byte
|
||||
want []FeedLink
|
||||
}
|
||||
|
||||
func TestParseHTMLContentMatchLink(t *testing.T) {
|
||||
table := []testParseHTMLContentItem{
|
||||
{content: []byte(`
|
||||
<html>
|
||||
<head>
|
||||
<title>html title</title>
|
||||
<link type="application/rss+xml" title="feed title" href="https://example.com/x/rss.xml">
|
||||
<link type="application/atom+xml" href="https://example.com/x/atom.xml">
|
||||
</head>
|
||||
<body>
|
||||
<link type="application/feed+json" title="link in body" href="https://example.com/x/feed.json">
|
||||
</body>
|
||||
</html>
|
||||
`), want: []FeedLink{
|
||||
{Title: "feed title", Link: "https://example.com/x/rss.xml"},
|
||||
{Title: "html title", Link: "https://example.com/x/atom.xml"},
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range table {
|
||||
finder := Finder{}
|
||||
feed, err := finder.parseHTMLContent(context.Background(), tt.content)
|
||||
assert.Nil(t, err)
|
||||
assert.ElementsMatch(t, tt.want, feed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseHTMLContentMatchLinkElement(t *testing.T) {
|
||||
table := []testParseHTMLContentItem{
|
||||
// match <a>
|
||||
{content: []byte(`
|
||||
<html>
|
||||
<head><title>html title</title></head>
|
||||
<body>
|
||||
<p>xxx</p>
|
||||
<main>
|
||||
<p>xxx</p>
|
||||
<a href="https://github.com/golang/go/releases.atom">RSS: Release notes from go</a>
|
||||
</main>
|
||||
<footer>
|
||||
<a href="https://github.com/golang/go">wrong rss</a>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
`), want: []FeedLink{
|
||||
{Title: "Release notes from go", Link: "https://github.com/golang/go/releases.atom"},
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range table {
|
||||
finder := Finder{httpClient: newClient()}
|
||||
feed, err := finder.parseHTMLContent(context.Background(), tt.content)
|
||||
assert.Nil(t, err)
|
||||
assert.ElementsMatch(t, tt.want, feed)
|
||||
}
|
||||
}
|
|
@ -1,23 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// transportOptionFunc customises the transport of a client built by newClient.
type transportOptionFunc func(transport *http.Transport)

// newClient builds an HTTP client on a clone of http.DefaultTransport with
// keep-alives disabled and HTTP/2 attempts forced. Each option is applied to
// the transport in order, after those defaults, so options may override them.
// The client has an overall timeout of one minute.
func newClient(options ...transportOptionFunc) *http.Client {
	tr := http.DefaultTransport.(*http.Transport).Clone()
	tr.DisableKeepAlives, tr.ForceAttemptHTTP2 = true, true

	for i := range options {
		options[i](tr)
	}

	client := new(http.Client)
	client.Transport = tr
	client.Timeout = time.Minute
	return client
}
|
|
@ -1,198 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type serviceMatcher func(context.Context) ([]FeedLink, error)
|
||||
|
||||
func (f *Finder) tryService(ctx context.Context) ([]FeedLink, error) {
|
||||
matcher := []serviceMatcher{
|
||||
f.githubMatcher,
|
||||
f.redditMatcher,
|
||||
f.youtubeMatcher,
|
||||
}
|
||||
for _, fn := range matcher {
|
||||
feed, err := fn(ctx)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if len(feed) != 0 {
|
||||
return feed, nil
|
||||
}
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var githubGlobalFeed = []FeedLink{
|
||||
{Title: "global public timeline", Link: "https://github.com/timeline"},
|
||||
{Title: "global security advisories", Link: "https://github.com/security-advisories.atom"},
|
||||
}
|
||||
|
||||
// https://docs.github.com/en/rest/activity/feeds?apiVersion=2022-11-28#get-feeds
|
||||
func (s Finder) githubMatcher(ctx context.Context) ([]FeedLink, error) {
|
||||
if !strings.HasSuffix(s.target.Hostname(), "github.com") {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
splited := strings.SplitN(s.target.Path, "/", 4) // split "/user/repo/" -> []string{"", "user", "repo", ""}
|
||||
splitedLen := len(splited)
|
||||
if splitedLen < 2 {
|
||||
return githubGlobalFeed, nil
|
||||
}
|
||||
|
||||
username, reponame := "", ""
|
||||
if splitedLen >= 2 {
|
||||
username = splited[1]
|
||||
}
|
||||
if splitedLen >= 3 {
|
||||
reponame = splited[2]
|
||||
}
|
||||
|
||||
if reponame != "" {
|
||||
re, err := regexp.Compile(`^[a-zA-Z0-9][a-zA-Z0-9-_\.]{0,98}[a-zA-Z0-9]$`) // todo need improve
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !re.MatchString(reponame) {
|
||||
return nil, nil
|
||||
}
|
||||
return genGitHubRepoFeed(username + "/" + reponame), nil
|
||||
}
|
||||
|
||||
if username != "" {
|
||||
re, err := regexp.Compile(`^[a-zA-Z0-9][-]?[a-zA-Z0-9]{0,38}$`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !re.MatchString(username) {
|
||||
return nil, nil
|
||||
}
|
||||
return genGitHubUserFeed(username), nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func genGitHubUserFeed(username string) []FeedLink {
|
||||
return []FeedLink{{Title: username + " public timeline", Link: fmt.Sprintf("https://github.com/%s.atom", username)}}
|
||||
}
|
||||
|
||||
func genGitHubRepoFeed(userRepo string) []FeedLink {
|
||||
return []FeedLink{
|
||||
{Title: fmt.Sprintf("%s commits", userRepo), Link: fmt.Sprintf("https://github.com/%s/commits.atom", userRepo)},
|
||||
{Title: fmt.Sprintf("%s releases", userRepo), Link: fmt.Sprintf("https://github.com/%s/releases.atom", userRepo)},
|
||||
{Title: fmt.Sprintf("%s tags", userRepo), Link: fmt.Sprintf("https://github.com/%s/tags.atom", userRepo)},
|
||||
{Title: fmt.Sprintf("%s wiki", userRepo), Link: fmt.Sprintf("https://github.com/%s/wiki.atom", userRepo)},
|
||||
}
|
||||
}
|
||||
|
||||
var redditGlobalFeed = []FeedLink{
|
||||
{Title: "global", Link: "https://www.reddit.com/.rss"},
|
||||
}
|
||||
|
||||
// https://www.reddit.com/wiki/rss/
|
||||
func (s Finder) redditMatcher(ctx context.Context) ([]FeedLink, error) {
|
||||
if !strings.HasSuffix(s.target.Hostname(), "reddit.com") {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
splited := strings.SplitN(s.target.Path, "/", 4)
|
||||
splitedLen := len(splited)
|
||||
if splitedLen < 2 {
|
||||
return redditGlobalFeed, nil
|
||||
}
|
||||
|
||||
mode, param := splited[1], splited[2]
|
||||
switch mode {
|
||||
case "r":
|
||||
if splitedLen >= 4 && strings.HasPrefix(splited[3], "comments") {
|
||||
// "comments/{postID}/{title}"
|
||||
// "comments/{postID}/{title}/comment/{commentID}"
|
||||
return genRedditCommentFeed(s.target.String()), nil
|
||||
}
|
||||
return genRedditSubFeed(param), nil
|
||||
case "user":
|
||||
return genRedditUserFeed(param), nil
|
||||
case "domain":
|
||||
return genRedditDomainSubmissionFeed(param), nil
|
||||
default:
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func genRedditSubFeed(sub string) []FeedLink {
|
||||
return []FeedLink{
|
||||
{Title: fmt.Sprintf("/r/%s hot", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/hot/.rss", sub)},
|
||||
{Title: fmt.Sprintf("/r/%s new", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/new/.rss", sub)},
|
||||
{Title: fmt.Sprintf("/r/%s top", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/top/.rss", sub)},
|
||||
{Title: fmt.Sprintf("/r/%s rising", sub), Link: fmt.Sprintf("https://reddit.com/r/%s/rising/.rss", sub)},
|
||||
}
|
||||
}
|
||||
|
||||
func genRedditCommentFeed(fullURL string) []FeedLink {
|
||||
return []FeedLink{{Title: "post", Link: fullURL + ".rss"}}
|
||||
}
|
||||
|
||||
func genRedditUserFeed(username string) []FeedLink {
|
||||
return []FeedLink{
|
||||
{Title: fmt.Sprintf("/u/%s overview new", username), Link: fmt.Sprintf("https://reddit.com/user/%s/.rss?sort=new", username)},
|
||||
{Title: fmt.Sprintf("/u/%s overview hot", username), Link: fmt.Sprintf("https://reddit.com/user/%s/.rss?sort=hot", username)},
|
||||
{Title: fmt.Sprintf("/u/%s overview top", username), Link: fmt.Sprintf("https://reddit.com/user/%s/.rss?sort=top", username)},
|
||||
{Title: fmt.Sprintf("/u/%s post new", username), Link: fmt.Sprintf("https://reddit.com/user/%s/submitted/.rss?sort=new", username)},
|
||||
{Title: fmt.Sprintf("/u/%s post hot", username), Link: fmt.Sprintf("https://reddit.com/user/%s/submitted/.rss?sort=hot", username)},
|
||||
{Title: fmt.Sprintf("/u/%s post top", username), Link: fmt.Sprintf("https://reddit.com/user/%s/submitted/.rss?sort=top", username)},
|
||||
{Title: fmt.Sprintf("/u/%s comments new", username), Link: fmt.Sprintf("https://reddit.com/user/%s/comments/.rss?sort=new", username)},
|
||||
{Title: fmt.Sprintf("/u/%s comments hot", username), Link: fmt.Sprintf("https://reddit.com/user/%s/comments/.rss?sort=hot", username)},
|
||||
{Title: fmt.Sprintf("/u/%s comments top", username), Link: fmt.Sprintf("https://reddit.com/user/%s/comments/.rss?sort=top", username)},
|
||||
{Title: fmt.Sprintf("/u/%s awards received (legacy)", username), Link: fmt.Sprintf("https://old.reddit.com/user/%s/gilded/.rss", username)},
|
||||
}
|
||||
}
|
||||
|
||||
func genRedditDomainSubmissionFeed(domain string) []FeedLink {
|
||||
return []FeedLink{{Title: "/domain/" + domain, Link: fmt.Sprintf("https://reddit.com/domain/%s/.rss", domain)}}
|
||||
}
|
||||
|
||||
func (s Finder) youtubeMatcher(ctx context.Context) ([]FeedLink, error) {
|
||||
if !strings.HasSuffix(s.target.Hostname(), "youtube.com") && !strings.HasSuffix(s.target.Hostname(), "youtu.be") {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
resp, err := s.httpClient.Get(s.target.String())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
content, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.HasPrefix(s.target.Path, "/@") {
|
||||
re, err := regexp.Compile(`{"key":"browse_id","value":"(.+?)"}`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
match := re.FindStringSubmatch(string(content))
|
||||
if len(match) < 2 {
|
||||
return nil, nil
|
||||
}
|
||||
id := match[1]
|
||||
if id == "" {
|
||||
return nil, nil
|
||||
}
|
||||
return []FeedLink{{Title: "Channel", Link: "https://www.youtube.com/feeds/videos.xml?channel_id=" + id}}, nil
|
||||
} else if strings.HasPrefix(s.target.Path, "/playlist") {
|
||||
id := s.target.Query().Get("list")
|
||||
if id == "" {
|
||||
return nil, nil
|
||||
}
|
||||
return []FeedLink{{Title: "Playlist", Link: "https://www.youtube.com/feeds/videos.xml?playlist_id=" + id}}, nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
|
@ -1,71 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestGitHub(t *testing.T) {
|
||||
type testItem struct {
|
||||
url *url.URL
|
||||
want []FeedLink
|
||||
}
|
||||
|
||||
urlBase, _ := url.Parse("https://github.com?xxx=1")
|
||||
urlUser, _ := url.Parse("https://github.com/user")
|
||||
urlUserFeed := genGitHubUserFeed("user")
|
||||
urlUserRepo, _ := url.Parse("https://github.com/user/repo")
|
||||
urlRepoFeed := genGitHubRepoFeed("user/repo")
|
||||
|
||||
table := []testItem{
|
||||
{url: urlBase, want: githubGlobalFeed},
|
||||
{url: urlUser, want: urlUserFeed},
|
||||
{url: urlUserRepo, want: urlRepoFeed},
|
||||
}
|
||||
|
||||
for _, tt := range table {
|
||||
finder := Finder{
|
||||
target: tt.url,
|
||||
}
|
||||
feed, err := finder.githubMatcher(context.Background())
|
||||
assert.Nil(t, err)
|
||||
assert.ElementsMatch(t, tt.want, feed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReddit(t *testing.T) {
|
||||
type testItem struct {
|
||||
url *url.URL
|
||||
want []FeedLink
|
||||
}
|
||||
|
||||
urlBase, _ := url.Parse("https://www.reddit.com")
|
||||
urlSub, _ := url.Parse("https://www.reddit.com/r/homelab")
|
||||
urlSubFeed := genRedditSubFeed("homelab")
|
||||
urlComment, _ := url.Parse("https://www.reddit.com/r/homelab/comments/1234/xxx_xxx_xx/")
|
||||
urlCommentFeed := genRedditCommentFeed("https://www.reddit.com/r/homelab/comments/1234/xxx_xxx_xx/")
|
||||
urlUser, _ := url.Parse("https://www.reddit.com/user/x")
|
||||
urlUserFeed := genRedditUserFeed("x")
|
||||
urlDomainSubmission, _ := url.Parse("https://www.reddit.com/domain/github.com/")
|
||||
urlDomainSubmissionFeed := genRedditDomainSubmissionFeed("github.com")
|
||||
|
||||
table := []testItem{
|
||||
{url: urlBase, want: redditGlobalFeed},
|
||||
{url: urlSub, want: urlSubFeed},
|
||||
{url: urlComment, want: urlCommentFeed},
|
||||
{url: urlUser, want: urlUserFeed},
|
||||
{url: urlDomainSubmission, want: urlDomainSubmissionFeed},
|
||||
}
|
||||
|
||||
for _, tt := range table {
|
||||
finder := Finder{
|
||||
target: tt.url,
|
||||
}
|
||||
feed, err := finder.redditMatcher(context.Background())
|
||||
assert.Nil(t, err)
|
||||
assert.ElementsMatch(t, tt.want, feed)
|
||||
}
|
||||
}
|
|
@ -1,72 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"io"
|
||||
"net/url"
|
||||
|
||||
"github.com/mmcdole/gofeed"
|
||||
)
|
||||
|
||||
func (f *Finder) tryWellKnown(ctx context.Context, baseURL string) ([]FeedLink, error) {
|
||||
wellKnown := []string{
|
||||
"atom.xml",
|
||||
"feed.xml",
|
||||
"rss.xml",
|
||||
"index.xml",
|
||||
"atom.json",
|
||||
"feed.json",
|
||||
"rss.json",
|
||||
"index.json",
|
||||
"feed/",
|
||||
"rss/",
|
||||
}
|
||||
feeds := make([]FeedLink, 0)
|
||||
|
||||
for _, suffix := range wellKnown {
|
||||
newTarget, err := url.JoinPath(baseURL, suffix)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
feed, err := f.parseRSSUrl(ctx, newTarget)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if !isEmptyFeedLink(feed) {
|
||||
feed.Link = newTarget // this may be more accurate than the link parsed from the rss content
|
||||
feeds = append(feeds, feed)
|
||||
}
|
||||
}
|
||||
|
||||
return feeds, nil
|
||||
}
|
||||
|
||||
func (f *Finder) parseRSSUrl(ctx context.Context, target string) (FeedLink, error) {
|
||||
resp, err := f.httpClient.Get(target)
|
||||
if err != nil {
|
||||
return FeedLink{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
content, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return FeedLink{}, err
|
||||
}
|
||||
return parseRSSContent(content)
|
||||
}
|
||||
|
||||
func parseRSSContent(content []byte) (FeedLink, error) {
|
||||
parsed, err := gofeed.NewParser().Parse(bytes.NewReader(content))
|
||||
if err != nil || parsed == nil {
|
||||
return FeedLink{}, err
|
||||
}
|
||||
return FeedLink{
|
||||
// https://github.com/mmcdole/gofeed#default-mappings
|
||||
Title: parsed.Title,
|
||||
|
||||
// set as default value, but the value parsed from rss are not always accurate.
|
||||
// it is better to use the url that gets the rss content.
|
||||
Link: parsed.FeedLink,
|
||||
}, nil
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
package feedfinder
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestParseRSSContent(t *testing.T) {
|
||||
type testItem struct {
|
||||
content []byte
|
||||
want FeedLink
|
||||
}
|
||||
|
||||
// TODO: match all types, e.g. https://github.com/mmcdole/gofeed/tree/master/testdata
|
||||
table := []testItem{
|
||||
{content: []byte(`
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
|
||||
<channel>
|
||||
<title>test</title>
|
||||
<link>https://example.com/</link>
|
||||
<language>en</language>
|
||||
<lastBuildDate>Fri, 24 Feb 2023 00:43:57 +0800</lastBuildDate>
|
||||
<atom:link href="https://example.com/feed.xml" rel="self" type="application/rss+xml"/>
|
||||
<item>
|
||||
<title>post1</title>
|
||||
<link>https://example.com/post1/</link>
|
||||
<pubDate>Fri, 24 Feb 2023 00:43:57 +0800</pubDate>
|
||||
<guid>https://example.com/post1/</guid>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
`), want: FeedLink{Title: "test", Link: "https://example.com/feed.xml"}},
|
||||
}
|
||||
|
||||
for _, tt := range table {
|
||||
feed, err := parseRSSContent(tt.content)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, tt.want, feed)
|
||||
}
|
||||
}
|
|
@ -7,8 +7,8 @@ import (
|
|||
"net/url"
|
||||
"sync"
|
||||
|
||||
"github.com/0x2E/feedfinder"
|
||||
"github.com/0x2e/fusion/model"
|
||||
"github.com/0x2e/fusion/pkg/feedfinder"
|
||||
"github.com/0x2e/fusion/repo"
|
||||
"github.com/0x2e/fusion/service/pull"
|
||||
"github.com/0x2e/fusion/service/pull/client"
|
||||
|
@ -140,8 +140,8 @@ func (f Feed) CheckValidity(ctx context.Context, req *ReqFeedCheckValidity) (*Re
|
|||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sniffed, err := feedfinder.Find(ctx, target, feedfinder.Options{
|
||||
ReqProxy: req.RequestOptions.Proxy,
|
||||
sniffed, err := feedfinder.Find(ctx, target.String(), &feedfinder.Options{
|
||||
RequestProxy: req.RequestOptions.Proxy,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue