From 044483f646de2d11b9798b9d2355d4d22093c940 Mon Sep 17 00:00:00 2001 From: wgroeneveld Date: Sat, 10 Apr 2021 16:16:08 +0200 Subject: [PATCH] why am I writing my own set implementation? geez --- app/rss/feed.go | 19 +++++----- app/webmention/send/rsslinkcollector.go | 22 ++++++++++- app/webmention/send/rsslinkcollector_test.go | 33 ++++++++-------- common/collections.go | 40 ++++++++++++++++++++ common/config.go | 9 +++++ 5 files changed, 93 insertions(+), 30 deletions(-) create mode 100644 common/collections.go diff --git a/app/rss/feed.go b/app/rss/feed.go index 5209730..4be1d66 100644 --- a/app/rss/feed.go +++ b/app/rss/feed.go @@ -5,7 +5,6 @@ import ( "encoding/xml" "errors" "github.com/rs/zerolog/log" - "html/template" "time" ) @@ -24,13 +23,13 @@ type Rss2 struct { type Item struct { // Required - Title string `xml:"title"` - Link string `xml:"link"` - Description template.HTML `xml:"description"` + Title string `xml:"title"` + Link string `xml:"link"` + Description string `xml:"description"` // could also be template.HTML, not interested in that // Optional - Content template.HTML `xml:"encoded"` - PubDate string `xml:"pubDate"` - Comments string `xml:"comments"` + Content string `xml:"encoded"` + PubDate string `xml:"pubDate"` + Comments string `xml:"comments"` } func (itm Item) PubDateAsTime() time.Time { @@ -62,9 +61,9 @@ type Entry struct { Author Author `xml:"author"` } -func ParseFeed(content []byte) (Rss2, error) { - v := Rss2{} - err := xml.Unmarshal(content, &v) +func ParseFeed(content []byte) (*Rss2, error) { + v := &Rss2{} + err := xml.Unmarshal(content, v) if err != nil { return v, err } diff --git a/app/webmention/send/rsslinkcollector.go b/app/webmention/send/rsslinkcollector.go index e69b16c..e5bcd53 100644 --- a/app/webmention/send/rsslinkcollector.go +++ b/app/webmention/send/rsslinkcollector.go @@ -2,6 +2,8 @@ package send import ( "brainbaking.com/go-jamming/app/rss" + "brainbaking.com/go-jamming/common" + "regexp" "time" ) @@ -35,7 +37,7 @@ type RSSItem struct { ' ' } **/ -func Collect(xml string, since time.Time) ([]RSSItem, error) { +func (snder *Sender) Collect(xml string, since time.Time) ([]RSSItem, error) { feed, err := rss.ParseFeed([]byte(xml)) if err != nil { return nil, err @@ -44,9 +46,25 @@ func Collect(xml string, since time.Time) ([]RSSItem, error) { for _, rssitem := range feed.ItemList { if since.Before(rssitem.PubDateAsTime()) { items = append(items, RSSItem{ - link: rssitem.Link, + link: rssitem.Link, + hrefs: snder.collectUniqueHrefsFromDescription(rssitem.Description), }) } } return items, nil } + +func (snder *Sender) collectUniqueHrefsFromDescription(html string) []string { + r := regexp.MustCompile(`href="(.+?)"`) + ext := regexp.MustCompile(`\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`) + urlmap := common.NewSet() + + for _, match := range r.FindAllStringSubmatch(html, -1) { + url := match[1] // [0] is the match of the entire expression, [1] is the capture group + if !ext.MatchString(url) && !snder.Conf.ContainsDisallowedDomain(url) { + urlmap.Add(url) + } + } + + return urlmap.Keys() +} diff --git a/app/webmention/send/rsslinkcollector_test.go b/app/webmention/send/rsslinkcollector_test.go index cc31ef9..9029a19 100644 --- a/app/webmention/send/rsslinkcollector_test.go +++ b/app/webmention/send/rsslinkcollector_test.go @@ -10,12 +10,20 @@ import ( type CollectSuite struct { suite.Suite - xml string + xml string + snder *Sender } func (s *CollectSuite) SetupTest() { file, _ := ioutil.ReadFile("../../../mocks/samplerss.xml") s.xml = string(file) + s.snder = &Sender{ + Conf: &common.Config{ + DisallowedWebmentionDomains: []string{ + "youtube.com", + }, + }, + } } func TestCollectSuite(t *testing.T) { @@ -23,32 +31,21 @@ func TestCollectSuite(t *testing.T) { } func (s *CollectSuite) TestCollectShouldNotContainHrefsFromBlockedDomains() { - items, err := Collect(s.xml, common.IsoToTime("2021-03-10T00:00:00.000Z")) + items, err := s.snder.Collect(s.xml, common.IsoToTime("2021-03-10T00:00:00.000Z")) assert.NoError(s.T(), err) last := items[len(items)-1] assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/10h16m24s22/", last.link) - /* - assert.Equal(s.T(), []string{ - "https://dog.estate/@eli_oat", - "https://twitter.com/olesovhcom/status/1369478732247932929", - "/aobut", - }, last.hrefs) - - */ + assert.ElementsMatch(s.T(), []string{ + "https://dog.estate/@eli_oat", + "https://twitter.com/olesovhcom/status/1369478732247932929", + "/about", + }, last.hrefs) } func (s *CollectSuite) TestCollectShouldNotContainHrefsThatPointToImages() { } -func (s *CollectSuite) TestCollectIfTimeTagFoundInContextThatActsAsAnUpdateStamp() { - -} - -func (s *CollectSuite) TestCollectsNotIfTimeTagFoundInContextButStillOlderThanSince() { - -} - func (s *CollectSuite) TestCollectNothingIfDateInFutureAndSinceNothingNewInFeed() { } diff --git a/common/collections.go b/common/collections.go new file mode 100644 index 0000000..320d3b9 --- /dev/null +++ b/common/collections.go @@ -0,0 +1,40 @@ +package common + +type EmptySetVal struct{} + +var member EmptySetVal + +type Set struct { + data map[string]EmptySetVal +} + +func NewSet() *Set { + return &Set{ + data: map[string]EmptySetVal{}, + } +} + +func (set *Set) Add(val string) { + set.data[val] = member +} + +func (set *Set) Del(val string) { + delete(set.data, val) +} + +func (set *Set) Len() int { + return len(set.data) +} + +func (set *Set) HasKey(key string) bool { + _, exists := set.data[key] + return exists +} + +func (set *Set) Keys() []string { + keys := make([]string, 0, len(set.data)) + for key := range set.data { + keys = append(keys, key) + } + return keys +} diff --git a/common/config.go b/common/config.go index cd1fd61..f6df1f5 100644 --- a/common/config.go +++ b/common/config.go @@ -18,6 +18,15 @@ type Config struct { DisallowedWebmentionDomains []string } +func (c *Config) ContainsDisallowedDomain(url string) bool { + for _, domain := range c.DisallowedWebmentionDomains { + if strings.Contains(url, domain) { + return true + } + } + return false +} + func (c *Config) IsAnAllowedDomain(url string) bool { for _, domain := range c.AllowedWebmentionSources { if domain == url {