parse <time/> tags if found in description as soft lastmod

2021-03-22 20:26:03 +01:00 · 2021-03-22 20:26:03 +01:00 · f3e2d1d1c9
parent 5668db8f80
commit f3e2d1d1c9
4 changed files with 85 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -53,6 +53,10 @@ This does a couple of things:

 As with the `POST` call, will result in a `202 Accepted` and handles things async/in parallel. 

+**Does this thing take updates into account?**
+
+Yes and no. By default it checks the `<pubDate/>` tag of each RSS `<item/>`, but if a `<time datetime="..."/>` tag is present in the `<description/>`, it treats that date as the "last modified" date and uses whichever of the two is more recent. There is no such thing in the RSS 2.0 specification, so I had to come up with my own hack! Remember that if you want this to work, you also need to include a `<time/>` tag in the item descriptions of your RSS feed (e.g. rendered from `.Lastmod` gitinfo in Hugo).
+
 ## TODOs

 - `published` date is not well-formatted and blindly taken over from feed
--- a/src/webmention/rsslinkcollector.js
+++ b/src/webmention/rsslinkcollector.js
@@ -12,7 +12,10 @@ const parseOpts = {

 function collectHrefsFromDescription(description) {
 	// first thought: use parser.parse() and traverse recursively. turned out to be way too slow.
-	const links = description.match(/href="([^"]*")/g)
+	const linksMatch = description.match(/href="([^"]*")/g)
+  if(!linksMatch) return []
+
+  const links = linksMatch
 		.map(match => match.replace("href=", "").replace(/\"/g, ""))
 		.filter(match => !(/\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$/i).test(match))
 		.filter(match => !config.disallowedWebmentionDomains.some(domain => match.indexOf(domain) >= 0))
@@ -21,6 +24,7 @@ function collectHrefsFromDescription(description) {

 /**
 * a typical RSS item looks like this:
+-- if <time/> found in body, assume it's a lastmod update timestamp!
 {
    title: '@celia @kev I have read both you and Kev&#39;s post on...',
    link: 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
@@ -38,7 +42,7 @@ function collectHrefsFromDescription(description) {
      '\n' +
      '\n' +
      '          <p>\n' +
-      '            By <a href="/about">Wouter Groeneveld</a> on 16 March 2021.\n' +
+      '            By <a href="/about">Wouter Groeneveld</a> on <time datetime="2021-03-20">20 March 2021</time>.\n' +
      '          </p>\n' +
      '          '
  }
@@ -47,18 +51,31 @@ function collect(xml, since = '') {
  const root = parser.parse(xml, parseOpts).rss.channel
  const sinceDate = dayjs(since)

-  // example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
-  const sincePubDate = (date) => {
+  const enrichWithDateProperties = (item) => {
+    // example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
+    const rawpub = item.pubDate?.split(", ")?.[1]
+    item.pubDate = rawpub ? dayjs(rawpub, "DD MMM YYYY HH:mm:ss ZZ") : dayjs()
+    if(!item.pubDate.isValid()) item.pubDate = dayjs()
+
+    const dateTimeMatch = item.description.match(/datetime="([^"]*")/g)
+    // Only the first datetime attribute found is used - could be dangerous if the description contains several <time/> tags.
+    const rawlastmod = dateTimeMatch?.[0]?.replace("datetime=", "")?.replace(/\"/g, "")
+    item.lastmodDate = rawlastmod ? dayjs(rawlastmod) : dayjs(0)
+
+    return item
+  }
+
+  const sincePublicationDate = (item) => {
  	if(!sinceDate.isValid()) return true
-  	const pubDate = dayjs(date.split(", ")[1], "DD MMM YYYY HH:mm:ss ZZ")
-  	if(!pubDate.isValid()) return true
-  	return sinceDate < pubDate
+  	
+    return sinceDate < (item.lastmodDate > item.pubDate ? item.lastmodDate : item.pubDate)
  }

  const entries = root.item.filter ? root.item : [root.item]

  return entries
-  	.filter(item => sincePubDate(item.pubDate))
+    .map(enrichWithDateProperties)
+  	.filter(sincePublicationDate)
  	.map(item => {
  	return {
  		link: item.link,
--- a/test/__mocks__/samplerss-updated-timestamp.xml
+++ b/test/__mocks__/samplerss-updated-timestamp.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+  <channel>
+    <title>Brain Baking</title>
+    <link>https://brainbaking.com/</link>
+    <description>Recent content on Brain Baking</description>
+    <generator>Hugo -- gohugo.io</generator>
+    <language>en-us</language>
+    <managingEditor>Wouter Groeneveld</managingEditor>
+    <webMaster>Wouter Groeneveld</webMaster>
+    <lastBuildDate>Tue, 16 Mar 2021 17:07:14 +0000</lastBuildDate>
+    
+	<atom:link href="https://brainbaking.com/index.xml" rel="self" type="application/rss+xml" />
+    
+    
+    <item>
+      <title>@celia @kev I have read both you and Kev&#39;s post on...</title>
+      <link>https://brainbaking.com/notes/2021/03/16h17m07s14/</link>
+      <comments>https://brainbaking.com/notes/2021/03/16h17m07s14/#commento</comments>
+      <pubDate>Tue, 16 Mar 2021 17:07:14 +0000</pubDate>
+      <author>Wouter Groeneveld</author>
+      <guid isPermaLink="true">https://brainbaking.com/notes/2021/03/16h17m07s14/</guid>
+      
+
+      
+      <description>
+          <![CDATA[ 
+          
+
+          <p><span class="h-card"><a class="u-url mention" data-user="A5GVjIHI6MH82H6iLQ" href="https://fosstodon.org/@celia" rel="ugc">@<span>celia</span></a></span> <span class="h-card"><a class="u-url mention" data-user="A54b8g0RBaIgjzczMu" href="https://fosstodon.org/@kev" rel="ugc">@<span>kev</span></a></span> I have read both you and Kev&rsquo;s post on this and agree on some points indeed! But I&rsquo;m not yet ready to give up webmentions. As an academic, the idea of citing/mentioning each other is very alluring 🤓. Plus, I needed an excuse to fiddle some more with JS&hellip; <br><br>As much as I loved using Wordpress before, I can&rsquo;t imagine going back to writing stuff in there instead of in markdown. Gotta keep the workflow short, though. Hope it helps you focus on what matters - content!</p>
+
+
+          <p>
+            By <a href="/about">Wouter Groeneveld</a> on <time datetime="2021-03-20">20 March 2021</time>.
+          </p>
+          ]]>
+      </description>
+    </item>
+  </channel>
+</rss>
--- a/test/webmention/rsslinkcollector.test.js
+++ b/test/webmention/rsslinkcollector.test.js
@@ -35,6 +35,22 @@ describe("collect RSS links of articles since certain period", () => {
 		])
 	})

+	test("collects if time tag found in content that acts as an update stamp", async () => {
+		// sample item: pubDate 2021-03-16, timestamp updated: 2021-03-20
+		xml = (await fs.readFile('./test/__mocks__/samplerss-updated-timestamp.xml')).toString()
+
+		const collected = collect(xml, dayjs('2021-03-19').toDate())
+		expect(collected.length).toBe(1)
+	})
+
+	test("does not collect if time tag found in content but still older than since", async () => {
+		// sample item: pubDate 2021-03-16, timestamp updated: 2021-03-20
+		xml = (await fs.readFile('./test/__mocks__/samplerss-updated-timestamp.xml')).toString()
+
+		const collected = collect(xml, dayjs('2021-03-21').toDate())
+		expect(collected.length).toBe(0)
+	})
+
 	test("collects nothing if date in future and since nothing new in feed", () => {
 		const collected = collect(xml, dayjs().add(7, 'day').toDate())
 		expect(collected.length).toEqual(0)