fix href parsing: only collect hrefs from <a> tags, matching case-insensitively

2022-04-26 14:44:08 +02:00 · 2022-04-26 14:44:08 +02:00 · 3f1ece1c39
parent eec2ed69f8
commit 3f1ece1c39
3 changed files with 140 additions and 81 deletions
--- a/app/webmention/send/discoverer_test.go
+++ b/app/webmention/send/discoverer_test.go
@ -16,7 +16,7 @@ func TestDiscoverRssFeedE2EBrainbaking(t *testing.T) {
 	}
 	link, err := sender.discoverRssFeed("brainbaking.com")
-	expectedUrl := "https://brainbaking.com/index.xml"
+	expectedUrl := "https://brainbaking.com/all/index.xml"
 	assert.Nil(t, err)
 	assert.Truef(t, strings.HasPrefix(link, expectedUrl), "should start with %s, but was %s", expectedUrl, link)
 }
--- a/app/webmention/send/rsslinkcollector.go
+++ b/app/webmention/send/rsslinkcollector.go
@ -56,8 +56,8 @@ func (snder *Sender) Collect(xml string, lastSentLink string) ([]RSSItem, error)
 }
 var (
-	hrefRegexp = regexp.MustCompile(`href="(.+?)"`)
+	hrefRegexp = regexp.MustCompile(`(?i)<a.+?href="(.+?)"`)
-	extRegexp  = regexp.MustCompile(`\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`)
+	extRegexp  = regexp.MustCompile(`(?i)\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`)
 )
 func (snder *Sender) collectUniqueHrefsFromHtml(html string) []string {
--- a/app/webmention/send/rsslinkcollector_test.go
+++ b/app/webmention/send/rsslinkcollector_test.go
@ -3,95 +3,154 @@ package send
 import (
 	"brainbaking.com/go-jamming/common"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/suite"
 	"io/ioutil"
 	"testing"
 )
-type CollectSuite struct {
+func TestCollectUniqueHrefsFromHtml(t *testing.T) {
-	suite.Suite
+	cases := []struct {
-	xml   string
+		label         string
-	snder *Sender
+		html          string
 		expectedLinks []string
 	}{
 		{
 			"should not contain inline links",
 			`<html><body><a href="#inline">sup</a></body></html>`,
 			[]string{},
 		},
 		{
 			"should not collect blacklisted links",
 			`<html><body><a href="https://www.blacklisted.com/wowo.html">sup</a> and also <a href="/dinges">dinges</a>!</body></html>`,
 			[]string{
 				"/dinges",
 			},
 		},
 		{
 			"should not collect hrefs from <link/> tags, only from <a/> ones",
 			`<html><head><link rel="stylesheet" href="/style.css"></head><body><a href="/dinges">dinges</a>!</body></html>`,
 			[]string{
 				"/dinges",
 			},
 		},
 		{
 			"should collect even if href is not the first attribute of an <a> tag",
 			`<html><body><a style="cool" target="_blank" href="/one">one</a> and <a target="_blank" href="/two">two</a> and <a href="/three">three</a></body></html>`,
 			[]string{
 				"/one",
 				"/two",
 				"/three",
 			},
 		},
 		{
 			"should collect case insensitive",
 			`<html><body><A href="/one">one</A> and <a href="/two">two</a> and <a HREF="/three">three</a></body></html>`,
 			[]string{
 				"/one",
 				"/two",
 				"/three",
 			},
 		},
 		{
 			"should not collect zips or ZIPs or gifs or GIFS",
 			`<a href="/cool.gif">cool gif</a> and <a href="/more-cool.GIF">more-cool gif</a> and here's a zip: <a href="baf.ZIP">baf</a> or <a href="boef.zip">boef.zip</a>??'`,
 			[]string{},
 		},
 	}
 	s := &Sender{
 		Conf: &common.Config{
 			Blacklist: []string{
 				"blacklisted.com",
 			},
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.label, func(t *testing.T) {
 			result := s.collectUniqueHrefsFromHtml(tc.html)
 			assert.ElementsMatch(t, tc.expectedLinks, result)
 		})
 	}
 }
-func (s *CollectSuite) SetupTest() {
+func TestCollect(t *testing.T) {
 	file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml")
-	s.xml = string(file)
+	snder := &Sender{
 	s.snder = &Sender{
 		Conf: &common.Config{
 			Blacklist: []string{
 				"youtube.com",
 			},
 		},
 	}
-}
+
-
+	cases := []struct {
-func TestCollectSuite(t *testing.T) {
+		label             string
-	suite.Run(t, new(CollectSuite))
+		lastsentlink      string
-}
+		expectedRssItems  int
-
+		expectedLastLinks []string
-func (s *CollectSuite) TestCollectUniqueHrefsFromHtmlShouldNotContainInlineLinks() {
+	}{
-	links := s.snder.collectUniqueHrefsFromHtml(`<html><body><a href="#inline">sup</a></body></html>`)
+		{
-	assert.Empty(s.T(), links)
+			"should not contain hrefs from blocked domains",
-}
+			"https://brainbaking.com/notes/2021/03/09h15m17s30/",
-
+			10,
-func (s *CollectSuite) TestCollectShouldNotContainHrefsFromBlockedDomains() {
+			[]string{
-	items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/09h15m17s30/")
+				"https://dog.estate/@eli_oat",
-	assert.NoError(s.T(), err)
+				"https://twitter.com/olesovhcom/status/1369478732247932929",
-	last := items[len(items)-1]
+				"/about",
-	assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/10h16m24s22/", last.link)
+			},
-	assert.ElementsMatch(s.T(), []string{
+		},
-		"https://dog.estate/@eli_oat",
+		{
-		"https://twitter.com/olesovhcom/status/1369478732247932929",
+			// test case: contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg
-		"/about",
+			"should not contain hrefs that point to images",
-	}, last.hrefs)
+			"https://brainbaking.com/notes/2021/03/13h12m44s29/",
-}
+			4,
-
+			[]string{
-func (s *CollectSuite) TestCollectShouldNotContainHrefsThatPointToImages() {
+				"/about",
-	items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/13h12m44s29/")
+			},
-	assert.NoError(s.T(), err)
+		},
-	last := items[len(items)-1]
+		{
-	// test case:
+			"collects nothing if nothing new in feed",
-	// contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg
+			"https://brainbaking.com/notes/2021/03/16h17m07s14/",
-	assert.ElementsMatch(s.T(), []string{
+			0,
-		"/about",
+			[]string{},
-	}, last.hrefs)
+		},
-}
+		{
-
+			"collect latest X links when a recent link parameter is provided",
-func (s *CollectSuite) TestCollectNothingIfNothingNewInFeed() {
+			"https://brainbaking.com/notes/2021/03/14h17m41s53/",
-	latestEntry := "https://brainbaking.com/notes/2021/03/16h17m07s14/"
+			3,
-	items, err := s.snder.Collect(s.xml, latestEntry)
+			[]string{
-	assert.NoError(s.T(), err)
+				"http://replit.com",
-	assert.Equal(s.T(), 0, len(items))
+				"http://codepen.io",
-}
+				"https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/",
-
+				"/about",
-func (s *CollectSuite) TestCollectLatestXLinksWhenARecentLinkParameterIsProvided() {
+			},
-	items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/14h17m41s53/")
+		},
-	assert.NoError(s.T(), err)
+		{
-	assert.Equal(s.T(), 3, len(items))
+			"collect every external link without a recent link",
-
+			"",
-	last := items[len(items)-1]
+			141,
-	assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/15h14m43s49/", last.link)
+			[]string{
-	assert.ElementsMatch(s.T(), []string{
+				"/notes/index.xml",
-		"http://replit.com",
+				"/archives",
-		"http://codepen.io",
+				"/categories/hardware/index.xml",
-		"https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/",
+				"/about",
-		"/about",
+				"https://netnewswire.com/",
-	}, last.hrefs)
+				"/index.xml",
-
+				"brainbaking.com",
-}
+				"/post/index.xml",
-
+			},
-func (s *CollectSuite) TestCollectEveryExternalLinkWithoutARecentLink() {
+		},
-	items, err := s.snder.Collect(s.xml, "")
+	}
-	assert.NoError(s.T(), err)
+
-	assert.Equal(s.T(), 141, len(items))
+	for _, tc := range cases {
-
+		t.Run(tc.label, func(t *testing.T) {
-	first := items[0]
+			items, err := snder.Collect(string(file), tc.lastsentlink)
-	assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/16h17m07s14/", first.link)
+			assert.NoError(t, err)
-	assert.ElementsMatch(s.T(), []string{
+			assert.Equal(t, tc.expectedRssItems, len(items))
-		"https://fosstodon.org/@celia",
+
-		"https://fosstodon.org/@kev",
+			if tc.expectedRssItems > 0 {
-		"/about",
+				last := items[len(items)-1]
-	}, first.hrefs)
+				assert.ElementsMatch(t, tc.expectedLastLinks, last.hrefs)
-
+			}
 		})
 	}
 }