2021-04-10 10:17:38 +02:00
|
|
|
package send
|
|
|
|
|
|
|
|
import (
|
|
|
|
"brainbaking.com/go-jamming/app/rss"
|
2021-04-10 16:16:08 +02:00
|
|
|
"brainbaking.com/go-jamming/common"
|
|
|
|
"regexp"
|
2021-04-25 12:18:31 +02:00
|
|
|
"strings"
|
2021-04-10 10:17:38 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
// RSSItem is the condensed form of a single feed entry as built by Collect:
// the entry's own link together with the hrefs extracted from its description.
type RSSItem struct {
	// link is copied from the feed item's Link field.
	link string
	// hrefs holds the unique, filtered href values found in the item's description HTML.
	hrefs []string
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* a typical RSS item looks like this:
|
|
|
|
-- if <time/> found in body, assume it's a lastmod update timestamp!
|
|
|
|
{
|
|
|
|
title: '@celia @kev I have read both you and Kev's post on...',
|
|
|
|
link: 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
|
|
|
|
comments: 'https://brainbaking.com/notes/2021/03/16h17m07s14/#commento',
|
|
|
|
pubDate: 'Tue, 16 Mar 2021 17:07:14 +0000',
|
|
|
|
author: 'Wouter Groeneveld',
|
|
|
|
guid: {
|
|
|
|
'#text': 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
|
|
|
|
'@_isPermaLink': 'true'
|
|
|
|
},
|
|
|
|
description: ' \n' +
|
|
|
|
' \n' +
|
|
|
|
'\n' +
|
|
|
|
' <p><span class="h-card"><a class="u-url mention" data-user="A5GVjIHI6MH82H6iLQ" href="https://fosstodon.org/@celia" rel="ugc">@<span>celia</span></a></span> <span class="h-card"><a class="u-url mention" data-user="A54b8g0RBaIgjzczMu" href="https://fosstodon.org/@kev" rel="ugc">@<span>kev</span></a></span> I have read both you and Kev’s post on this and agree on some points indeed! But I’m not yet ready to give up webmentions. As an academic, the idea of citing/mentioning each other is very alluring 🤓. Plus, I needed an excuse to fiddle some more with JS… <br><br>As much as I loved using Wordpress before, I can’t imagine going back to writing stuff in there instead of in markdown. Gotta keep the workflow short, though. Hope it helps you focus on what matters - content!</p>\n' +
|
|
|
|
'\n' +
|
|
|
|
'\n' +
|
|
|
|
' <p>\n' +
|
|
|
|
' By <a href="/about">Wouter Groeneveld</a> on <time datetime='2021-03-20'>20 March 2021</time>.\n' +
|
|
|
|
' </p>\n' +
|
|
|
|
' '
|
|
|
|
}
|
|
|
|
**/
|
2021-05-02 09:41:13 +02:00
|
|
|
func (snder *Sender) Collect(xml string, lastSentLink string) ([]RSSItem, error) {
|
2021-04-10 10:17:38 +02:00
|
|
|
feed, err := rss.ParseFeed([]byte(xml))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
var items []RSSItem
|
|
|
|
for _, rssitem := range feed.ItemList {
|
2021-05-02 09:41:13 +02:00
|
|
|
if rssitem.Link == lastSentLink {
|
|
|
|
break
|
2021-04-10 10:17:38 +02:00
|
|
|
}
|
2021-05-02 09:41:13 +02:00
|
|
|
items = append(items, RSSItem{
|
|
|
|
link: rssitem.Link,
|
|
|
|
hrefs: snder.collectUniqueHrefsFromHtml(rssitem.Description),
|
|
|
|
})
|
2021-04-10 10:17:38 +02:00
|
|
|
}
|
|
|
|
return items, nil
|
|
|
|
}
|
2021-04-10 16:16:08 +02:00
|
|
|
|
2021-04-13 09:10:32 +02:00
|
|
|
var (
	// hrefRegexp captures, in group 1, the value of each double-quoted href
	// attribute. The match is non-greedy so it stops at the first closing quote.
	hrefRegexp = regexp.MustCompile(`href="(.+?)"`)
	// extRegexp matches strings that end in an archive or image file extension.
	// It is case-insensitive so that upper- or mixed-case extensions such as
	// ".JPG" or ".Png" are recognised as well.
	extRegexp = regexp.MustCompile(`(?i)\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$`)
)
|
|
|
|
|
2021-04-25 12:18:31 +02:00
|
|
|
func (snder *Sender) collectUniqueHrefsFromHtml(html string) []string {
|
2021-04-10 16:16:08 +02:00
|
|
|
urlmap := common.NewSet()
|
|
|
|
|
2021-04-13 09:10:32 +02:00
|
|
|
for _, match := range hrefRegexp.FindAllStringSubmatch(html, -1) {
|
2021-04-10 16:16:08 +02:00
|
|
|
url := match[1] // [0] is the match of the entire expression, [1] is the capture group
|
2021-05-02 11:40:45 +02:00
|
|
|
if !extRegexp.MatchString(url) && !snder.Conf.IsBlacklisted(url) && !strings.HasPrefix(url, "#") {
|
2021-04-10 16:16:08 +02:00
|
|
|
urlmap.Add(url)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return urlmap.Keys()
|
|
|
|
}
|