diff --git a/README.md b/README.md
index db2aa6c..c8ec18f 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,10 @@ This does a couple of things:
As with the `POST` call, will result in a `202 Accepted` and handles things async/in parallel.
+**Does this thing take updates into account?**
+
+Yes and no. It checks the `<item>` `<pubDate>` RSS tag by default, but if a `<time datetime="...">` tag is present in the `<description>`, it treats that date as the "last modified" date. There is no such thing in the RSS 2.0 W3.org specs, so I had to come up with my own hacks! Remember that if you want this to work, you also need to include a time tag in your RSS feed (e.g. `.Lastmod` gitinfo in Hugo).
+
## TODOs
- `published` date is not well-formatted and blindly taken over from feed
diff --git a/src/webmention/rsslinkcollector.js b/src/webmention/rsslinkcollector.js
index 9c47063..6d59c14 100644
--- a/src/webmention/rsslinkcollector.js
+++ b/src/webmention/rsslinkcollector.js
@@ -12,7 +12,10 @@ const parseOpts = {
function collectHrefsFromDescription(description) {
// first thought: use parser.parse() and traverse recursively. turned out to be way too slow.
- const links = description.match(/href="([^"]*")/g)
+ const linksMatch = description.match(/href="([^"]*")/g)
+ if(!linksMatch) return []
+
+ const links = linksMatch
.map(match => match.replace("href=", "").replace(/\"/g, ""))
.filter(match => !(/\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$/i).test(match))
.filter(match => !config.disallowedWebmentionDomains.some(domain => match.indexOf(domain) >= 0))
@@ -21,6 +24,7 @@ function collectHrefsFromDescription(description) {
/**
* a typical RSS item looks like this:
+<time/> -- if found in body, assume it's a lastmod update timestamp!
{
title: '@celia @kev I have read both you and Kev's post on...',
link: 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
@@ -38,7 +42,7 @@ function collectHrefsFromDescription(description) {
'\n' +
'\n' +
'
\n' +
' '
}
@@ -47,18 +51,31 @@ function collect(xml, since = '') {
const root = parser.parse(xml, parseOpts).rss.channel
const sinceDate = dayjs(since)
- // example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
- const sincePubDate = (date) => {
+ const enrichWithDateProperties = (item) => {
+ // example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
+ const rawpub = item.pubDate?.split(", ")?.[1]
+ item.pubDate = rawpub ? dayjs(rawpub, "DD MMM YYYY HH:mm:ss ZZ") : dayjs()
+ if(!item.pubDate.isValid()) item.pubDate = dayjs()
+
+ const dateTimeMatch = item.description.match(/datetime="([^"]*")/g)
+ // Selecting the first - could be dangerous. Living on the edge. Don't care. etc.
+ const rawlastmod = dateTimeMatch?.[0]?.replace("datetime=", "")?.replace(/\"/g, "")
+ item.lastmodDate = rawlastmod ? dayjs(rawlastmod) : dayjs(0)
+
+ return item
+ }
+
+ const sincePublicationDate = (item) => {
if(!sinceDate.isValid()) return true
- const pubDate = dayjs(date.split(", ")[1], "DD MMM YYYY HH:mm:ss ZZ")
- if(!pubDate.isValid()) return true
- return sinceDate < pubDate
+
+ return sinceDate < (item.lastmodDate > item.pubDate ? item.lastmodDate : item.pubDate)
}
const entries = root.item.filter ? root.item : [root.item]
return entries
- .filter(item => sincePubDate(item.pubDate))
+ .map(enrichWithDateProperties)
+ .filter(sincePublicationDate)
.map(item => {
return {
link: item.link,
diff --git a/test/__mocks__/samplerss-updated-timestamp.xml b/test/__mocks__/samplerss-updated-timestamp.xml
new file mode 100644
index 0000000..b51d585
--- /dev/null
+++ b/test/__mocks__/samplerss-updated-timestamp.xml
@@ -0,0 +1,40 @@
+
+
+
+ Brain Baking
+ https://brainbaking.com/
+ Recent content on Brain Baking
+ Hugo -- gohugo.io
+ en-us
+ Wouter Groeneveld
+ Wouter Groeneveld
+ Tue, 16 Mar 2021 17:07:14 +0000
+
+
+
+
+
+ @celia @kev I have read both you and Kev's post on...
+ https://brainbaking.com/notes/2021/03/16h17m07s14/
+ https://brainbaking.com/notes/2021/03/16h17m07s14/#commento
+ Tue, 16 Mar 2021 17:07:14 +0000
+ Wouter Groeneveld
+ https://brainbaking.com/notes/2021/03/16h17m07s14/
+
+
+
+
+ @celia@kev I have read both you and Kev’s post on this and agree on some points indeed! But I’m not yet ready to give up webmentions. As an academic, the idea of citing/mentioning each other is very alluring 🤓. Plus, I needed an excuse to fiddle some more with JS…
As much as I loved using Wordpress before, I can’t imagine going back to writing stuff in there instead of in markdown. Gotta keep the workflow short, though. Hope it helps you focus on what matters - content!
+
+
+
+ ]]>
+
+
+
+
diff --git a/test/webmention/rsslinkcollector.test.js b/test/webmention/rsslinkcollector.test.js
index e3f3ede..bbefb7c 100644
--- a/test/webmention/rsslinkcollector.test.js
+++ b/test/webmention/rsslinkcollector.test.js
@@ -35,6 +35,22 @@ describe("collect RSS links of articles since certain period", () => {
])
})
+ test("collects if time tag found in content that acts as an update stamp", async () => {
+ // sample item: pubDate 2021-03-16, timestamp updated: 2021-03-20
+ xml = (await fs.readFile('./test/__mocks__/samplerss-updated-timestamp.xml')).toString()
+
+ const collected = collect(xml, dayjs('2021-03-19').toDate())
+ expect(collected.length).toBe(1)
+ })
+
+ test("does not collect if time tag found in content but still older than since", async () => {
+ // sample item: pubDate 2021-03-16, timestamp updated: 2021-03-20
+ xml = (await fs.readFile('./test/__mocks__/samplerss-updated-timestamp.xml')).toString()
+
+ const collected = collect(xml, dayjs('2021-03-21').toDate())
+ expect(collected.length).toBe(0)
+ })
+
test("collects nothing if date in future and since nothing new in feed", () => {
const collected = collect(xml, dayjs().add(7, 'day').toDate())
expect(collected.length).toEqual(0)