Fix HREF parsing: collect hrefs from <a/> tags only (case-insensitive)

This commit is contained in:
Wouter Groeneveld 2022-04-26 14:44:08 +02:00
parent eec2ed69f8
commit 3f1ece1c39
3 changed files with 140 additions and 81 deletions

View File

@ -16,7 +16,7 @@ func TestDiscoverRssFeedE2EBrainbaking(t *testing.T) {
} }
link, err := sender.discoverRssFeed("brainbaking.com") link, err := sender.discoverRssFeed("brainbaking.com")
expectedUrl := "https://brainbaking.com/index.xml" expectedUrl := "https://brainbaking.com/all/index.xml"
assert.Nil(t, err) assert.Nil(t, err)
assert.Truef(t, strings.HasPrefix(link, expectedUrl), "should start with %s, but was %s", expectedUrl, link) assert.Truef(t, strings.HasPrefix(link, expectedUrl), "should start with %s, but was %s", expectedUrl, link)
} }

View File

@ -56,8 +56,8 @@ func (snder *Sender) Collect(xml string, lastSentLink string) ([]RSSItem, error)
} }
var ( var (
hrefRegexp = regexp.MustCompile(`href="(.+?)"`) hrefRegexp = regexp.MustCompile(`(?i)<a.+?href="(.+?)"`)
extRegexp = regexp.MustCompile(`\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`) extRegexp = regexp.MustCompile(`(?i)\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`)
) )
func (snder *Sender) collectUniqueHrefsFromHtml(html string) []string { func (snder *Sender) collectUniqueHrefsFromHtml(html string) []string {

View File

@ -3,95 +3,154 @@ package send
import ( import (
"brainbaking.com/go-jamming/common" "brainbaking.com/go-jamming/common"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
"io/ioutil" "io/ioutil"
"testing" "testing"
) )
type CollectSuite struct { func TestCollectUniqueHrefsFromHtml(t *testing.T) {
suite.Suite cases := []struct {
xml string label string
snder *Sender html string
expectedLinks []string
}{
{
"should not contain inline links",
`<html><body><a href="#inline">sup</a></body></html>`,
[]string{},
},
{
"should not collect blacklisted links",
`<html><body><a href="https://www.blacklisted.com/wowo.html">sup</a> and also <a href="/dinges">dinges</a>!</body></html>`,
[]string{
"/dinges",
},
},
{
"should not collect hrefs from <link/> tags, only from <a/> ones",
`<html><head><link rel="stylesheet" href="/style.css"></head><body><a href="/dinges">dinges</a>!</body></html>`,
[]string{
"/dinges",
},
},
{
"should collect even if href is not the first attribute of an <a> tag",
`<html><body><a style="cool" target="_blank" href="/one">one</a> and <a target="_blank" href="/two">two</a> and <a href="/three">three</a></body></html>`,
[]string{
"/one",
"/two",
"/three",
},
},
{
"should collect case insensitive",
`<html><body><A href="/one">one</A> and <a href="/two">two</a> and <a HREF="/three">three</a></body></html>`,
[]string{
"/one",
"/two",
"/three",
},
},
{
"should not collect zips or ZIPs or gifs or GIFS",
`<a href="/cool.gif">cool gif</a> and <a href="/more-cool.GIF">more-cool gif</a> and here's a zip: <a href="baf.ZIP">baf</a> or <a href="boef.zip">boef.zip</a>??'`,
[]string{},
},
}
s := &Sender{
Conf: &common.Config{
Blacklist: []string{
"blacklisted.com",
},
},
}
for _, tc := range cases {
t.Run(tc.label, func(t *testing.T) {
result := s.collectUniqueHrefsFromHtml(tc.html)
assert.ElementsMatch(t, tc.expectedLinks, result)
})
}
} }
func (s *CollectSuite) SetupTest() { func TestCollect(t *testing.T) {
file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml") file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml")
s.xml = string(file) snder := &Sender{
s.snder = &Sender{
Conf: &common.Config{ Conf: &common.Config{
Blacklist: []string{ Blacklist: []string{
"youtube.com", "youtube.com",
}, },
}, },
} }
}
cases := []struct {
func TestCollectSuite(t *testing.T) { label string
suite.Run(t, new(CollectSuite)) lastsentlink string
} expectedRssItems int
expectedLastLinks []string
func (s *CollectSuite) TestCollectUniqueHrefsFromHtmlShouldNotContainInlineLinks() { }{
links := s.snder.collectUniqueHrefsFromHtml(`<html><body><a href="#inline">sup</a></body></html>`) {
assert.Empty(s.T(), links) "should not contain hrefs from blocked domains",
} "https://brainbaking.com/notes/2021/03/09h15m17s30/",
10,
func (s *CollectSuite) TestCollectShouldNotContainHrefsFromBlockedDomains() { []string{
items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/09h15m17s30/") "https://dog.estate/@eli_oat",
assert.NoError(s.T(), err) "https://twitter.com/olesovhcom/status/1369478732247932929",
last := items[len(items)-1] "/about",
assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/10h16m24s22/", last.link) },
assert.ElementsMatch(s.T(), []string{ },
"https://dog.estate/@eli_oat", {
"https://twitter.com/olesovhcom/status/1369478732247932929", // test case: contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg
"/about", "should not contain hrefs that point to images",
}, last.hrefs) "https://brainbaking.com/notes/2021/03/13h12m44s29/",
} 4,
[]string{
func (s *CollectSuite) TestCollectShouldNotContainHrefsThatPointToImages() { "/about",
items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/13h12m44s29/") },
assert.NoError(s.T(), err) },
last := items[len(items)-1] {
// test case: "collects nothing if nothing new in feed",
// contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg "https://brainbaking.com/notes/2021/03/16h17m07s14/",
assert.ElementsMatch(s.T(), []string{ 0,
"/about", []string{},
}, last.hrefs) },
} {
"collect latest X links when a recent link parameter is provided",
func (s *CollectSuite) TestCollectNothingIfNothingNewInFeed() { "https://brainbaking.com/notes/2021/03/14h17m41s53/",
latestEntry := "https://brainbaking.com/notes/2021/03/16h17m07s14/" 3,
items, err := s.snder.Collect(s.xml, latestEntry) []string{
assert.NoError(s.T(), err) "http://replit.com",
assert.Equal(s.T(), 0, len(items)) "http://codepen.io",
} "https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/",
"/about",
func (s *CollectSuite) TestCollectLatestXLinksWhenARecentLinkParameterIsProvided() { },
items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/14h17m41s53/") },
assert.NoError(s.T(), err) {
assert.Equal(s.T(), 3, len(items)) "collect every external link without a recent link",
"",
last := items[len(items)-1] 141,
assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/15h14m43s49/", last.link) []string{
assert.ElementsMatch(s.T(), []string{ "/notes/index.xml",
"http://replit.com", "/archives",
"http://codepen.io", "/categories/hardware/index.xml",
"https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/", "/about",
"/about", "https://netnewswire.com/",
}, last.hrefs) "/index.xml",
"brainbaking.com",
} "/post/index.xml",
},
func (s *CollectSuite) TestCollectEveryExternalLinkWithoutARecentLink() { },
items, err := s.snder.Collect(s.xml, "") }
assert.NoError(s.T(), err)
assert.Equal(s.T(), 141, len(items)) for _, tc := range cases {
t.Run(tc.label, func(t *testing.T) {
first := items[0] items, err := snder.Collect(string(file), tc.lastsentlink)
assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/16h17m07s14/", first.link) assert.NoError(t, err)
assert.ElementsMatch(s.T(), []string{ assert.Equal(t, tc.expectedRssItems, len(items))
"https://fosstodon.org/@celia",
"https://fosstodon.org/@kev", if tc.expectedRssItems > 0 {
"/about", last := items[len(items)-1]
}, first.hrefs) assert.ElementsMatch(t, tc.expectedLastLinks, last.hrefs)
}
})
}
} }