forked from wgroeneveld/go-jamming
fix HREF parsing to limit to <a/> tags only (case insensitive)
This commit is contained in:
parent
eec2ed69f8
commit
3f1ece1c39
|
@ -16,7 +16,7 @@ func TestDiscoverRssFeedE2EBrainbaking(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
link, err := sender.discoverRssFeed("brainbaking.com")
|
link, err := sender.discoverRssFeed("brainbaking.com")
|
||||||
expectedUrl := "https://brainbaking.com/index.xml"
|
expectedUrl := "https://brainbaking.com/all/index.xml"
|
||||||
assert.Nil(t, err)
|
assert.Nil(t, err)
|
||||||
assert.Truef(t, strings.HasPrefix(link, expectedUrl), "should start with %s, but was %s", expectedUrl, link)
|
assert.Truef(t, strings.HasPrefix(link, expectedUrl), "should start with %s, but was %s", expectedUrl, link)
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,8 +56,8 @@ func (snder *Sender) Collect(xml string, lastSentLink string) ([]RSSItem, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
	// hrefRegexp captures, in submatch 1, the value of a double-quoted href
	// attribute that belongs to an <a> tag. Matching is case-insensitive.
	//
	// The tag name must be followed by whitespace, so <abbr>, <article>,
	// <area>, ... are not mistaken for anchors. The attributes skipped before
	// href may not contain '>', so a match can never run out of an href-less
	// <a> tag into the href of a later <link> (or any other) element; the
	// trade-off is that an anchor with a literal '>' inside an earlier
	// attribute value is not matched. href itself must be preceded by
	// whitespace, so look-alike attributes such as data-href are skipped.
	// Because \s and [^>] also match newlines, tags whose attributes are
	// spread over several lines are matched as well.
	hrefRegexp = regexp.MustCompile(`(?i)<a\s(?:[^>]*?\s)?href="(.+?)"`)
	// extRegexp matches, case-insensitively, any string that ends in one of
	// the image/archive file extensions listed in the pattern.
	extRegexp = regexp.MustCompile(`(?i)\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`)
)
|
||||||
|
|
||||||
func (snder *Sender) collectUniqueHrefsFromHtml(html string) []string {
|
func (snder *Sender) collectUniqueHrefsFromHtml(html string) []string {
|
||||||
|
|
|
@ -3,95 +3,154 @@ package send
|
||||||
import (
|
import (
|
||||||
"brainbaking.com/go-jamming/common"
|
"brainbaking.com/go-jamming/common"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/suite"
|
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
type CollectSuite struct {
|
func TestCollectUniqueHrefsFromHtml(t *testing.T) {
|
||||||
suite.Suite
|
cases := []struct {
|
||||||
xml string
|
label string
|
||||||
snder *Sender
|
html string
|
||||||
|
expectedLinks []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
"should not contain inline links",
|
||||||
|
`<html><body><a href="#inline">sup</a></body></html>`,
|
||||||
|
[]string{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"should not collect blacklisted links",
|
||||||
|
`<html><body><a href="https://www.blacklisted.com/wowo.html">sup</a> and also <a href="/dinges">dinges</a>!</body></html>`,
|
||||||
|
[]string{
|
||||||
|
"/dinges",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"should not collect hrefs from <link/> tags, only from <a/> ones",
|
||||||
|
`<html><head><link rel="stylesheet" href="/style.css"></head><body><a href="/dinges">dinges</a>!</body></html>`,
|
||||||
|
[]string{
|
||||||
|
"/dinges",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"should collect even if href is not the first attribute of an <a> tag",
|
||||||
|
`<html><body><a style="cool" target="_blank" href="/one">one</a> and <a target="_blank" href="/two">two</a> and <a href="/three">three</a></body></html>`,
|
||||||
|
[]string{
|
||||||
|
"/one",
|
||||||
|
"/two",
|
||||||
|
"/three",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"should collect case insensitive",
|
||||||
|
`<html><body><A href="/one">one</A> and <a href="/two">two</a> and <a HREF="/three">three</a></body></html>`,
|
||||||
|
[]string{
|
||||||
|
"/one",
|
||||||
|
"/two",
|
||||||
|
"/three",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"should not collect zips or ZIPs or gifs or GIFS",
|
||||||
|
`<a href="/cool.gif">cool gif</a> and <a href="/more-cool.GIF">more-cool gif</a> and here's a zip: <a href="baf.ZIP">baf</a> or <a href="boef.zip">boef.zip</a>??'`,
|
||||||
|
[]string{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
s := &Sender{
|
||||||
|
Conf: &common.Config{
|
||||||
|
Blacklist: []string{
|
||||||
|
"blacklisted.com",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.label, func(t *testing.T) {
|
||||||
|
result := s.collectUniqueHrefsFromHtml(tc.html)
|
||||||
|
assert.ElementsMatch(t, tc.expectedLinks, result)
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *CollectSuite) SetupTest() {
|
func TestCollect(t *testing.T) {
|
||||||
file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml")
|
file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml")
|
||||||
s.xml = string(file)
|
snder := &Sender{
|
||||||
s.snder = &Sender{
|
|
||||||
Conf: &common.Config{
|
Conf: &common.Config{
|
||||||
Blacklist: []string{
|
Blacklist: []string{
|
||||||
"youtube.com",
|
"youtube.com",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
cases := []struct {
|
||||||
func TestCollectSuite(t *testing.T) {
|
label string
|
||||||
suite.Run(t, new(CollectSuite))
|
lastsentlink string
|
||||||
}
|
expectedRssItems int
|
||||||
|
expectedLastLinks []string
|
||||||
func (s *CollectSuite) TestCollectUniqueHrefsFromHtmlShouldNotContainInlineLinks() {
|
}{
|
||||||
links := s.snder.collectUniqueHrefsFromHtml(`<html><body><a href="#inline">sup</a></body></html>`)
|
{
|
||||||
assert.Empty(s.T(), links)
|
"should not contain hrefs from blocked domains",
|
||||||
}
|
"https://brainbaking.com/notes/2021/03/09h15m17s30/",
|
||||||
|
10,
|
||||||
func (s *CollectSuite) TestCollectShouldNotContainHrefsFromBlockedDomains() {
|
[]string{
|
||||||
items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/09h15m17s30/")
|
"https://dog.estate/@eli_oat",
|
||||||
assert.NoError(s.T(), err)
|
"https://twitter.com/olesovhcom/status/1369478732247932929",
|
||||||
last := items[len(items)-1]
|
"/about",
|
||||||
assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/10h16m24s22/", last.link)
|
},
|
||||||
assert.ElementsMatch(s.T(), []string{
|
},
|
||||||
"https://dog.estate/@eli_oat",
|
{
|
||||||
"https://twitter.com/olesovhcom/status/1369478732247932929",
|
// test case: contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg
|
||||||
"/about",
|
"should not contain hrefs that point to images",
|
||||||
}, last.hrefs)
|
"https://brainbaking.com/notes/2021/03/13h12m44s29/",
|
||||||
}
|
4,
|
||||||
|
[]string{
|
||||||
func (s *CollectSuite) TestCollectShouldNotContainHrefsThatPointToImages() {
|
"/about",
|
||||||
items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/13h12m44s29/")
|
},
|
||||||
assert.NoError(s.T(), err)
|
},
|
||||||
last := items[len(items)-1]
|
{
|
||||||
// test case:
|
"collects nothing if nothing new in feed",
|
||||||
// contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg
|
"https://brainbaking.com/notes/2021/03/16h17m07s14/",
|
||||||
assert.ElementsMatch(s.T(), []string{
|
0,
|
||||||
"/about",
|
[]string{},
|
||||||
}, last.hrefs)
|
},
|
||||||
}
|
{
|
||||||
|
"collect latest X links when a recent link parameter is provided",
|
||||||
func (s *CollectSuite) TestCollectNothingIfNothingNewInFeed() {
|
"https://brainbaking.com/notes/2021/03/14h17m41s53/",
|
||||||
latestEntry := "https://brainbaking.com/notes/2021/03/16h17m07s14/"
|
3,
|
||||||
items, err := s.snder.Collect(s.xml, latestEntry)
|
[]string{
|
||||||
assert.NoError(s.T(), err)
|
"http://replit.com",
|
||||||
assert.Equal(s.T(), 0, len(items))
|
"http://codepen.io",
|
||||||
}
|
"https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/",
|
||||||
|
"/about",
|
||||||
func (s *CollectSuite) TestCollectLatestXLinksWhenARecentLinkParameterIsProvided() {
|
},
|
||||||
items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/14h17m41s53/")
|
},
|
||||||
assert.NoError(s.T(), err)
|
{
|
||||||
assert.Equal(s.T(), 3, len(items))
|
"collect every external link without a recent link",
|
||||||
|
"",
|
||||||
last := items[len(items)-1]
|
141,
|
||||||
assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/15h14m43s49/", last.link)
|
[]string{
|
||||||
assert.ElementsMatch(s.T(), []string{
|
"/notes/index.xml",
|
||||||
"http://replit.com",
|
"/archives",
|
||||||
"http://codepen.io",
|
"/categories/hardware/index.xml",
|
||||||
"https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/",
|
"/about",
|
||||||
"/about",
|
"https://netnewswire.com/",
|
||||||
}, last.hrefs)
|
"/index.xml",
|
||||||
|
"brainbaking.com",
|
||||||
}
|
"/post/index.xml",
|
||||||
|
},
|
||||||
func (s *CollectSuite) TestCollectEveryExternalLinkWithoutARecentLink() {
|
},
|
||||||
items, err := s.snder.Collect(s.xml, "")
|
}
|
||||||
assert.NoError(s.T(), err)
|
|
||||||
assert.Equal(s.T(), 141, len(items))
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.label, func(t *testing.T) {
|
||||||
first := items[0]
|
items, err := snder.Collect(string(file), tc.lastsentlink)
|
||||||
assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/16h17m07s14/", first.link)
|
assert.NoError(t, err)
|
||||||
assert.ElementsMatch(s.T(), []string{
|
assert.Equal(t, tc.expectedRssItems, len(items))
|
||||||
"https://fosstodon.org/@celia",
|
|
||||||
"https://fosstodon.org/@kev",
|
if tc.expectedRssItems > 0 {
|
||||||
"/about",
|
last := items[len(items)-1]
|
||||||
}, first.hrefs)
|
assert.ElementsMatch(t, tc.expectedLastLinks, last.hrefs)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue