rss link collecting impl

This commit is contained in:
Wouter Groeneveld 2021-04-10 10:17:38 +02:00
parent d4c854ef81
commit 98695223ca
8 changed files with 298 additions and 0 deletions

View File

@ -1,6 +1,7 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="GoCommentStart" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
<inspection_tool class="GrazieInspection" enabled="false" level="TYPO" enabled_by_default="false" />
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">

82
app/rss/feed.go Normal file
View File

@ -0,0 +1,82 @@
package rss
import (
"brainbaking.com/go-jamming/common"
"encoding/xml"
"errors"
"github.com/rs/zerolog/log"
"html/template"
"time"
)
// someone already did this for me, yay! https://siongui.github.io/2015/03/03/go-parse-web-feed-rss-atom/
// Rss2 is the decoding target for an RSS document. It binds to the root
// <rss> element; the "channel>..." tag paths pull values out of the nested
// <channel> element.
type Rss2 struct {
	XMLName xml.Name `xml:"rss"`
	// Version holds the "version" attribute of the root element.
	Version string `xml:"version,attr"`
	// Required
	Title string `xml:"channel>title"`
	Link string `xml:"channel>link"`
	Description string `xml:"channel>description"`
	// Optional
	PubDate string `xml:"channel>pubDate"`
	// ItemList collects every <item> element found under <channel>.
	ItemList []Item `xml:"channel>item"`
}
// Item is a single <item> entry of an RSS channel.
type Item struct {
	// Required
	Title string `xml:"title"`
	Link string `xml:"link"`
	Description template.HTML `xml:"description"`
	// Optional
	// Content binds to an element named "encoded" (presumably content:encoded — confirm against real feeds).
	Content template.HTML `xml:"encoded"`
	// PubDate is kept as the raw string from the feed; PubDateAsTime converts it.
	PubDate string `xml:"pubDate"`
	Comments string `xml:"comments"`
}
// PubDateAsTime converts the item's raw pubDate string into a time.Time.
//
// RSS 2.0 dates follow RFC 822, so feeds may carry any numeric offset
// ("+0200"), a zone abbreviation ("GMT") and a one- or two-digit day. All of
// these are accepted. If none of the layouts match, a warning is logged and
// common.Now() is returned instead.
func (itm Item) PubDateAsTime() time.Time {
	layouts := [...]string{
		time.RFC1123Z, // Tue, 16 Mar 2021 17:07:14 +0000
		time.RFC1123,  // Tue, 16 Mar 2021 17:07:14 GMT
		// same as above, but tolerating a day without leading zero
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"Mon, 2 Jan 2006 15:04:05 MST",
	}
	for _, layout := range layouts {
		// NOTE: for abbreviations time.Parse does not know, it fabricates a
		// zone with a zero offset rather than failing.
		if t, err := time.Parse(layout, itm.PubDate); err == nil {
			return t
		}
	}
	log.Warn().Str("pubDate", itm.PubDate).Msg("Incorrectly formatted RSS date, reverting to now")
	return common.Now()
}
// Link captures the "href" attribute of a <link> element. It is only used as
// the type of Entry.Link in the visible source (this looks like Atom support
// — confirm).
type Link struct {
	Href string `xml:"href,attr"`
}
// Author holds the <name> and <email> children of an <author> element. It is
// only used as the type of Entry.Author in the visible source.
type Author struct {
	Name string `xml:"name"`
	Email string `xml:"email"`
}
// Entry describes a feed entry with title, summary, content, id, updated
// timestamp, link and author. Nothing in the visible source decodes into it
// yet (presumably reserved for Atom feeds — confirm).
type Entry struct {
	Title string `xml:"title"`
	Summary string `xml:"summary"`
	Content string `xml:"content"`
	Id string `xml:"id"`
	// Updated is kept as the raw string found in the feed.
	Updated string `xml:"updated"`
	Link Link `xml:"link"`
	Author Author `xml:"author"`
}
// ParseFeed decodes content as an RSS document.
//
// Only feeds whose version attribute is exactly "2.0" are accepted; anything
// else yields a "not RSS 2.0" error. For accepted feeds, every item that has
// a non-empty Content gets its Description replaced by that Content.
// On failure the (possibly partially decoded) feed is returned with the error.
func ParseFeed(content []byte) (Rss2, error) {
	var feed Rss2
	if err := xml.Unmarshal(content, &feed); err != nil {
		return feed, err
	}
	if feed.Version != "2.0" {
		return feed, errors.New("not RSS 2.0")
	}

	for i := range feed.ItemList {
		// work on the slice element itself so the overwrite is kept
		item := &feed.ItemList[i]
		if item.Content != "" {
			item.Description = item.Content
		}
	}
	return feed, nil
}

34
app/rss/feed_test.go Normal file
View File

@ -0,0 +1,34 @@
package rss
import (
"brainbaking.com/go-jamming/common"
"github.com/stretchr/testify/assert"
"testing"
"time"
)
// TestPubDateAsTimeIncorrectRevertsToNow verifies that an unparseable pubDate
// makes PubDateAsTime fall back to whatever common.Now returns.
func TestPubDateAsTimeIncorrectRevertsToNow(t *testing.T) {
	// common.Now is package-level state: put the real clock back afterwards so
	// the stub cannot leak into tests that run later in this package.
	originalNow := common.Now
	defer func() { common.Now = originalNow }()

	common.Now = func() time.Time {
		return time.Date(2020, time.January, 1, 12, 30, 0, 0, time.UTC)
	}
	itm := Item{
		PubDate: "frutselbolletjes",
	}

	theTime := itm.PubDateAsTime()

	assert.Equal(t, 2020, theTime.Year())
	assert.Equal(t, time.January, theTime.Month())
}
// TestPubDateAsTime verifies that a well-formed pubDate is decoded into the
// matching calendar date and wall-clock time.
func TestPubDateAsTime(t *testing.T) {
	parsed := Item{PubDate: "Tue, 16 Mar 2021 17:07:14 +0000"}.PubDateAsTime()

	year, month, day := parsed.Date()
	hour, minute, second := parsed.Clock()

	assert.Equal(t, 2021, year)
	assert.Equal(t, time.March, month)
	assert.Equal(t, 16, day)
	assert.Equal(t, 17, hour)
	assert.Equal(t, 7, minute)
	assert.Equal(t, 14, second)
}

View File

@ -0,0 +1,52 @@
package send
import (
"brainbaking.com/go-jamming/app/rss"
"time"
)
// RSSItem is what Collect returns for each feed item it selects.
type RSSItem struct {
	// link is copied from the feed item's Link.
	link string
	// hrefs is not populated by any code visible here (presumably meant for
	// the links found in the item's body — confirm once implemented).
	hrefs []string
}
/**
* a typical RSS item looks like this:
-- if <time/> found in body, assume it's a lastmod update timestamp!
{
title: '@celia @kev I have read both you and Kev&#39;s post on...',
link: 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
comments: 'https://brainbaking.com/notes/2021/03/16h17m07s14/#commento',
pubDate: 'Tue, 16 Mar 2021 17:07:14 +0000',
author: 'Wouter Groeneveld',
guid: {
'#text': 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
'@_isPermaLink': 'true'
},
description: ' \n' +
' \n' +
'\n' +
' <p><span class="h-card"><a class="u-url mention" data-user="A5GVjIHI6MH82H6iLQ" href="https://fosstodon.org/@celia" rel="ugc">@<span>celia</span></a></span> <span class="h-card"><a class="u-url mention" data-user="A54b8g0RBaIgjzczMu" href="https://fosstodon.org/@kev" rel="ugc">@<span>kev</span></a></span> I have read both you and Kev&rsquo;s post on this and agree on some points indeed! But I&rsquo;m not yet ready to give up webmentions. As an academic, the idea of citing/mentioning each other is very alluring 🤓. Plus, I needed an excuse to fiddle some more with JS&hellip; <br><br>As much as I loved using Wordpress before, I can&rsquo;t imagine going back to writing stuff in there instead of in markdown. Gotta keep the workflow short, though. Hope it helps you focus on what matters - content!</p>\n' +
'\n' +
'\n' +
' <p>\n' +
' By <a href="/about">Wouter Groeneveld</a> on <time datetime='2021-03-20'>20 March 2021</time>.\n' +
' </p>\n' +
' '
}
**/
// Collect parses xml as an RSS feed and returns one RSSItem for every feed
// item published strictly after since. Only the link of each item is filled
// in. A feed that cannot be parsed yields a nil slice and the parse error;
// a feed without matching items yields a nil slice and no error.
func Collect(xml string, since time.Time) ([]RSSItem, error) {
	feed, parseErr := rss.ParseFeed([]byte(xml))
	if parseErr != nil {
		return nil, parseErr
	}

	var collected []RSSItem
	for i := range feed.ItemList {
		entry := feed.ItemList[i]
		if !since.Before(entry.PubDateAsTime()) {
			continue // published at or before the cut-off
		}
		collected = append(collected, RSSItem{link: entry.Link})
	}
	return collected, nil
}

View File

@ -0,0 +1,62 @@
package send
import (
"brainbaking.com/go-jamming/common"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
"io/ioutil"
"testing"
)
// CollectSuite groups the tests for Collect.
type CollectSuite struct {
	suite.Suite
	// xml holds the sample feed contents loaded in SetupTest.
	xml string
}
// SetupTest loads the sample RSS fixture into s.xml. A fixture that cannot be
// read aborts the test immediately, instead of letting it continue with an
// empty feed and fail later for a misleading reason.
func (s *CollectSuite) SetupTest() {
	file, err := ioutil.ReadFile("../../../mocks/samplerss.xml")
	if err != nil {
		s.T().Fatalf("reading sample RSS fixture: %v", err)
	}
	s.xml = string(file)
}
func TestCollectSuite(t *testing.T) {
suite.Run(t, new(CollectSuite))
}
// TestCollectShouldNotContainHrefsFromBlockedDomains currently only checks
// that the last collected item carries the expected link; the href assertion
// is still commented out until hrefs are actually collected.
func (s *CollectSuite) TestCollectShouldNotContainHrefsFromBlockedDomains() {
	items, err := Collect(s.xml, common.IsoToTime("2021-03-10T00:00:00.000Z"))
	assert.NoError(s.T(), err)
	// assert.* only records a failure and carries on: bail out here rather
	// than panic with an index out of range on an empty result.
	if !assert.NotEmpty(s.T(), items) {
		return
	}

	last := items[len(items)-1]
	assert.Equal(s.T(), "https://brainbaking.com/notes/2021/03/10h16m24s22/", last.link)
	/*
		assert.Equal(s.T(), []string{
			"https://dog.estate/@eli_oat",
			"https://twitter.com/olesovhcom/status/1369478732247932929",
			"/aobut",
		}, last.hrefs)
	*/
}
// TODO: placeholder without assertions, so it always passes. Judging by the
// name it should cover hrefs that point to images being left out.
func (s *CollectSuite) TestCollectShouldNotContainHrefsThatPointToImages() {
}
// TODO: placeholder without assertions, so it always passes. Judging by the
// name it should cover a <time> tag in the item body acting as an update stamp.
func (s *CollectSuite) TestCollectIfTimeTagFoundInContextThatActsAsAnUpdateStamp() {
}
// TODO: placeholder without assertions, so it always passes. Judging by the
// name it should cover a <time> tag that is still older than since.
func (s *CollectSuite) TestCollectsNotIfTimeTagFoundInContextButStillOlderThanSince() {
}
// TODO: placeholder without assertions, so it always passes. Judging by the
// name it should cover a since date in the future yielding no items.
func (s *CollectSuite) TestCollectNothingIfDateInFutureAndSinceNothingNewInFeed() {
}
// TODO: placeholder without assertions, so it always passes. Judging by the
// name it should cover collecting only the latest links for a given since.
func (s *CollectSuite) TestCollectLatestXLinksWhenASinceParameterIsProvided() {
}
// TODO: placeholder without assertions, so it always passes. Judging by the
// name it should cover collecting every external link when since is invalid.
func (s *CollectSuite) TestCollectEveryExternalLinkWithoutAValidSinceDate() {
}

View File

@ -6,6 +6,7 @@ import (
"brainbaking.com/go-jamming/common"
"brainbaking.com/go-jamming/rest"
"github.com/rs/zerolog/log"
"time"
)
type Sender struct {
@ -15,6 +16,17 @@ type Sender struct {
// Send fetches https://<domain>/index.xml and hands the body to parseRssFeed,
// together with since converted through common.IsoToTime. If the feed cannot
// be retrieved, the error is logged and nothing else happens.
func (snder *Sender) Send(domain string, since string) {
	log.Info().Str("domain", domain).Str("since", since).Msg(` OK: someone wants to send mentions`)

	feedURL := "https://" + domain + "/index.xml"
	body, fetchErr := snder.RestClient.GetBody(feedURL)
	if fetchErr != nil {
		log.Err(fetchErr).Str("domain", domain).Msg("Unable to retrieve RSS feed, aborting send")
		return
	}

	sinceTime := common.IsoToTime(since)
	snder.parseRssFeed(body, sinceTime)
}
// parseRssFeed is called by Send with the downloaded feed body and the since
// cut-off. TODO: it is still an empty stub and does nothing yet.
func (snder *Sender) parseRssFeed(feed string, since time.Time) {
}
func mention() {

View File

@ -5,3 +5,13 @@ import "time"
// https://labs.yulrizka.com/en/stubbing-time-dot-now-in-golang/
// None of the above are very appealing. For now, just use the lazy way.
var Now = time.Now

// IsoToTime converts since into a time.Time. The input must be an ISO string
// with exactly three fractional digits and a trailing "Z", as produced by
// clients using day.js - e.g. 2021-04-09T15:51:43.732Z. Anything else
// (including the empty string) yields Now().
func IsoToTime(since string) time.Time {
	const isoMillisLayout = "2006-01-02T15:04:05.000Z"

	if parsed, err := time.Parse(isoMillisLayout, since); err == nil {
		return parsed
	}
	return Now()
}

45
common/time_test.go Normal file
View File

@ -0,0 +1,45 @@
package common
import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
"testing"
"time"
)
// TimeSuite groups the tests for IsoToTime.
type TimeSuite struct {
	suite.Suite
	// nowtime is the fixed instant the stubbed Now returns; set in SetupTest.
	nowtime time.Time
}
// SetupTest pins s.nowtime to 2020-01-01 12:30 UTC and replaces the
// package-level Now with a stub that returns s.nowtime.
func (s *TimeSuite) SetupTest() {
	frozen := time.Date(2020, time.January, 1, 12, 30, 0, 0, time.UTC)
	s.nowtime = frozen
	Now = func() time.Time { return s.nowtime }
}
func TestSendSuite(t *testing.T) {
suite.Run(t, new(TimeSuite))
}
// TestIsoToTimeInISOString verifies that a day.js style ISO string is decoded
// field by field, including its millisecond fraction.
func (s *TimeSuite) TestIsoToTimeInISOString() {
	// time.Date takes NANOseconds as its 7th argument, so the ".732" fraction
	// has to be scaled up from milliseconds.
	expectedtime := time.Date(2021, time.March, 9, 15, 51, 43, 732*int(time.Millisecond), time.UTC)

	since := IsoToTime("2021-03-09T15:51:43.732Z")

	assert.Equal(s.T(), expectedtime.Year(), since.Year())
	assert.Equal(s.T(), expectedtime.Month(), since.Month())
	assert.Equal(s.T(), expectedtime.Day(), since.Day())
	assert.Equal(s.T(), expectedtime.Hour(), since.Hour())
	assert.Equal(s.T(), expectedtime.Minute(), since.Minute())
	assert.Equal(s.T(), expectedtime.Second(), since.Second())
	assert.Equal(s.T(), expectedtime.Nanosecond(), since.Nanosecond())
	assert.True(s.T(), expectedtime.Equal(since), "expected %v, got %v", expectedtime, since)
}
// TestIsoToTimeInvalidStringReturnsNow verifies that a string that is not a
// date at all makes IsoToTime return the stubbed current time.
func (s *TimeSuite) TestIsoToTimeInvalidStringReturnsNow() {
	assert.Equal(s.T(), s.nowtime, IsoToTime("woef ik ben een hondje"))
}
// TestIsoToTimeEmptyReturnsNow verifies that an empty string makes IsoToTime
// return the stubbed current time.
func (s *TimeSuite) TestIsoToTimeEmptyReturnsNow() {
	assert.Equal(s.T(), s.nowtime, IsoToTime(""))
}