From 3f1ece1c39b66ffa7a132766e2644bb4241c995d Mon Sep 17 00:00:00 2001 From: wgroeneveld Date: Tue, 26 Apr 2022 14:44:08 +0200 Subject: [PATCH] fix HREF parsing to limit to tags only (case insensitive) --- app/webmention/send/discoverer_test.go | 2 +- app/webmention/send/rsslinkcollector.go | 4 +- app/webmention/send/rsslinkcollector_test.go | 215 ++++++++++++------- 3 files changed, 140 insertions(+), 81 deletions(-) diff --git a/app/webmention/send/discoverer_test.go b/app/webmention/send/discoverer_test.go index 339b62b..8949f40 100644 --- a/app/webmention/send/discoverer_test.go +++ b/app/webmention/send/discoverer_test.go @@ -16,7 +16,7 @@ func TestDiscoverRssFeedE2EBrainbaking(t *testing.T) { } link, err := sender.discoverRssFeed("brainbaking.com") - expectedUrl := "https://brainbaking.com/index.xml" + expectedUrl := "https://brainbaking.com/all/index.xml" assert.Nil(t, err) assert.Truef(t, strings.HasPrefix(link, expectedUrl), "should start with %s, but was %s", expectedUrl, link) } diff --git a/app/webmention/send/rsslinkcollector.go b/app/webmention/send/rsslinkcollector.go index c1abfea..050e538 100644 --- a/app/webmention/send/rsslinkcollector.go +++ b/app/webmention/send/rsslinkcollector.go @@ -56,8 +56,8 @@ func (snder *Sender) Collect(xml string, lastSentLink string) ([]RSSItem, error) } var ( - hrefRegexp = regexp.MustCompile(`href="(.+?)"`) - extRegexp = regexp.MustCompile(`\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`) + hrefRegexp = regexp.MustCompile(`(?i)sup`, + []string{}, + }, + { + "should not collect blacklisted links", + `sup and also dinges!`, + []string{ + "/dinges", + }, + }, + { + "should not collect hrefs from tags, only from ones", + `dinges!`, + []string{ + "/dinges", + }, + }, + { + "should collect even if href is not the first attribute of an tag", + `one and two and three`, + []string{ + "/one", + "/two", + "/three", + }, + }, + { + "should collect case insensitive", + `one and two and three`, + []string{ + "/one", + "/two", + "/three", + }, + }, + { + "should not collect zips or ZIPs or gifs or GIFS", + `cool gif and more-cool gif and here's a zip: baf or boef.zip??'`, + []string{}, + }, + } + + s := &Sender{ + Conf: &common.Config{ + Blacklist: []string{ + "blacklisted.com", + }, + }, + } + for _, tc := range cases { + t.Run(tc.label, func(t *testing.T) { + result := s.collectUniqueHrefsFromHtml(tc.html) + assert.ElementsMatch(t, tc.expectedLinks, result) + }) + } } -func (s *CollectSuite) SetupTest() { +func TestCollect(t *testing.T) { file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml") - s.xml = string(file) - s.snder = &Sender{ + snder := &Sender{ Conf: &common.Config{ Blacklist: []string{ "youtube.com", }, }, } -} - -func TestCollectSuite(t *testing.T) { - suite.Run(t, new(CollectSuite)) -} - -func (s *CollectSuite) TestCollectUniqueHrefsFromHtmlShouldNotContainInlineLinks() { - links := s.snder.collectUniqueHrefsFromHtml(`sup`) - assert.Empty(s.T(), links) -} - -func (s *CollectSuite) TestCollectShouldNotContainHrefsFromBlockedDomains() { - items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/09h15m17s30/") - assert.NoError(s.T(), err) - last := items[len(items)-1] - assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/10h16m24s22/", last.link) - assert.ElementsMatch(s.T(), []string{ - "https://dog.estate/@eli_oat", - "https://twitter.com/olesovhcom/status/1369478732247932929", - "/about", - }, last.hrefs) -} - -func (s *CollectSuite) TestCollectShouldNotContainHrefsThatPointToImages() { - items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/13h12m44s29/") - assert.NoError(s.T(), err) - last := items[len(items)-1] - // test case: - // contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg - assert.ElementsMatch(s.T(), []string{ - "/about", - }, last.hrefs) -} - -func (s *CollectSuite) TestCollectNothingIfNothingNewInFeed() { - latestEntry := "https://brainbaking.com/notes/2021/03/16h17m07s14/" - items, err := s.snder.Collect(s.xml, latestEntry) - assert.NoError(s.T(), err) - assert.Equal(s.T(), 0, len(items)) -} - -func (s *CollectSuite) TestCollectLatestXLinksWhenARecentLinkParameterIsProvided() { - items, err := s.snder.Collect(s.xml, "https://brainbaking.com/notes/2021/03/14h17m41s53/") - assert.NoError(s.T(), err) - assert.Equal(s.T(), 3, len(items)) - - last := items[len(items)-1] - assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/15h14m43s49/", last.link) - assert.ElementsMatch(s.T(), []string{ - "http://replit.com", - "http://codepen.io", - "https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/", - "/about", - }, last.hrefs) - -} - -func (s *CollectSuite) TestCollectEveryExternalLinkWithoutARecentLink() { - items, err := s.snder.Collect(s.xml, "") - assert.NoError(s.T(), err) - assert.Equal(s.T(), 141, len(items)) - - first := items[0] - assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/16h17m07s14/", first.link) - assert.ElementsMatch(s.T(), []string{ - "https://fosstodon.org/@celia", - "https://fosstodon.org/@kev", - "/about", - }, first.hrefs) - + + cases := []struct { + label string + lastsentlink string + expectedRssItems int + expectedLastLinks []string + }{ + { + "should not contain hrefs from blocked domains", + "https://brainbaking.com/notes/2021/03/09h15m17s30/", + 10, + []string{ + "https://dog.estate/@eli_oat", + "https://twitter.com/olesovhcom/status/1369478732247932929", + "/about", + }, + }, + { + // test case: contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg + "should not contain hrefs that point to images", + "https://brainbaking.com/notes/2021/03/13h12m44s29/", + 4, + []string{ + "/about", + }, + }, + { + "collects nothing if nothing new in feed", + "https://brainbaking.com/notes/2021/03/16h17m07s14/", + 0, + []string{}, + }, + { + "collect latest X links when a recent link parameter is provided", + "https://brainbaking.com/notes/2021/03/14h17m41s53/", + 3, + []string{ + "http://replit.com", + "http://codepen.io", + "https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/", + "/about", + }, + }, + { + "collect every external link without a recent link", + "", + 141, + []string{ + "/notes/index.xml", + "/archives", + "/categories/hardware/index.xml", + "/about", + "https://netnewswire.com/", + "/index.xml", + "brainbaking.com", + "/post/index.xml", + }, + }, + } + + for _, tc := range cases { + t.Run(tc.label, func(t *testing.T) { + items, err := snder.Collect(string(file), tc.lastsentlink) + assert.NoError(t, err) + assert.Equal(t, tc.expectedRssItems, len(items)) + + if tc.expectedRssItems > 0 { + last := items[len(items)-1] + assert.ElementsMatch(t, tc.expectedLastLinks, last.hrefs) + } + }) + } }