forked from wgroeneveld/go-jamming
parse <time/> tags if found in description as soft lastmod
This commit is contained in:
parent
5668db8f80
commit
f3e2d1d1c9
|
@ -53,6 +53,10 @@ This does a couple of things:
|
|||
|
||||
As with the `POST` call, this will result in a `202 Accepted` response, and things are handled asynchronously/in parallel.
|
||||
|
||||
**Does this thing take updates into account**?
|
||||
|
||||
Yes and no. By default, it checks the `<pubDate/>` tag of each RSS `<item/>`, but if a `<time datetime="..."/>` tag is present in the `<description/>`, it treats that date as the "last modified" date and uses whichever of the two dates is more recent. There is no such thing as a "last modified" field in the RSS 2.0 W3.org specs, so I had to come up with my own hacks! Remember that if you want this to work, you also need to include a `<time/>` tag with a double-quoted `datetime` attribute in the item descriptions of your RSS feed (e.g. filled with the `.Lastmod` gitinfo in Hugo).
|
||||
|
||||
## TODOs
|
||||
|
||||
- The `published` date is not well-formatted: it is copied blindly from the feed as-is
|
||||
|
|
|
@ -12,7 +12,10 @@ const parseOpts = {
|
|||
|
||||
function collectHrefsFromDescription(description) {
|
||||
// first thought: use parser.parse() and traverse recursively. turned out to be way too slow.
|
||||
const links = description.match(/href="([^"]*")/g)
|
||||
const linksMatch = description.match(/href="([^"]*")/g)
|
||||
if(!linksMatch) return []
|
||||
|
||||
const links = linksMatch
|
||||
.map(match => match.replace("href=", "").replace(/\"/g, ""))
|
||||
.filter(match => !(/\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$/i).test(match))
|
||||
.filter(match => !config.disallowedWebmentionDomains.some(domain => match.indexOf(domain) >= 0))
|
||||
|
@ -21,6 +24,7 @@ function collectHrefsFromDescription(description) {
|
|||
|
||||
/**
|
||||
* a typical RSS item looks like this:
|
||||
-- if <time/> found in body, assume it's a lastmod update timestamp!
|
||||
{
|
||||
title: '@celia @kev I have read both you and Kev's post on...',
|
||||
link: 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
|
||||
|
@ -38,7 +42,7 @@ function collectHrefsFromDescription(description) {
|
|||
'\n' +
|
||||
'\n' +
|
||||
' <p>\n' +
|
||||
' By <a href="/about">Wouter Groeneveld</a> on 16 March 2021.\n' +
|
||||
' By <a href="/about">Wouter Groeneveld</a> on <time datetime="2021-03-20">20 March 2021</time>.\n' +
|
||||
' </p>\n' +
|
||||
' '
|
||||
}
|
||||
|
@ -47,18 +51,31 @@ function collect(xml, since = '') {
|
|||
const root = parser.parse(xml, parseOpts).rss.channel
|
||||
const sinceDate = dayjs(since)
|
||||
|
||||
// example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
|
||||
const sincePubDate = (date) => {
|
||||
const enrichWithDateProperties = (item) => {
|
||||
// example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
|
||||
const rawpub = item.pubDate?.split(", ")?.[1]
|
||||
item.pubDate = rawpub ? dayjs(rawpub, "DD MMM YYYY HH:mm:ss ZZ") : dayjs()
|
||||
if(!item.pubDate.isValid()) item.pubDate = dayjs()
|
||||
|
||||
const dateTimeMatch = item.description.match(/datetime="([^"]*")/g)
|
||||
// Selecting the first - could be dangerous. Living on the edge. Don't care. etc.
|
||||
const rawlastmod = dateTimeMatch?.[0]?.replace("datetime=", "")?.replace(/\"/g, "")
|
||||
item.lastmodDate = rawlastmod ? dayjs(rawlastmod) : dayjs(0)
|
||||
|
||||
return item
|
||||
}
|
||||
|
||||
const sincePublicationDate = (item) => {
|
||||
if(!sinceDate.isValid()) return true
|
||||
const pubDate = dayjs(date.split(", ")[1], "DD MMM YYYY HH:mm:ss ZZ")
|
||||
if(!pubDate.isValid()) return true
|
||||
return sinceDate < pubDate
|
||||
|
||||
return sinceDate < (item.lastmodDate > item.pubDate ? item.lastmodDate : item.pubDate)
|
||||
}
|
||||
|
||||
const entries = root.item.filter ? root.item : [root.item]
|
||||
|
||||
return entries
|
||||
.filter(item => sincePubDate(item.pubDate))
|
||||
.map(enrichWithDateProperties)
|
||||
.filter(sincePublicationDate)
|
||||
.map(item => {
|
||||
return {
|
||||
link: item.link,
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
|
||||
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
|
||||
<channel>
|
||||
<title>Brain Baking</title>
|
||||
<link>https://brainbaking.com/</link>
|
||||
<description>Recent content on Brain Baking</description>
|
||||
<generator>Hugo -- gohugo.io</generator>
|
||||
<language>en-us</language>
|
||||
<managingEditor>Wouter Groeneveld</managingEditor>
|
||||
<webMaster>Wouter Groeneveld</webMaster>
|
||||
<lastBuildDate>Tue, 16 Mar 2021 17:07:14 +0000</lastBuildDate>
|
||||
|
||||
<atom:link href="https://brainbaking.com/index.xml" rel="self" type="application/rss+xml" />
|
||||
|
||||
|
||||
<item>
|
||||
<title>@celia @kev I have read both you and Kev's post on...</title>
|
||||
<link>https://brainbaking.com/notes/2021/03/16h17m07s14/</link>
|
||||
<comments>https://brainbaking.com/notes/2021/03/16h17m07s14/#commento</comments>
|
||||
<pubDate>Tue, 16 Mar 2021 17:07:14 +0000</pubDate>
|
||||
<author>Wouter Groeneveld</author>
|
||||
<guid isPermaLink="true">https://brainbaking.com/notes/2021/03/16h17m07s14/</guid>
|
||||
|
||||
|
||||
|
||||
<description>
|
||||
<![CDATA[
|
||||
|
||||
|
||||
<p><span class="h-card"><a class="u-url mention" data-user="A5GVjIHI6MH82H6iLQ" href="https://fosstodon.org/@celia" rel="ugc">@<span>celia</span></a></span> <span class="h-card"><a class="u-url mention" data-user="A54b8g0RBaIgjzczMu" href="https://fosstodon.org/@kev" rel="ugc">@<span>kev</span></a></span> I have read both you and Kev’s post on this and agree on some points indeed! But I’m not yet ready to give up webmentions. As an academic, the idea of citing/mentioning each other is very alluring 🤓. Plus, I needed an excuse to fiddle some more with JS… <br><br>As much as I loved using Wordpress before, I can’t imagine going back to writing stuff in there instead of in markdown. Gotta keep the workflow short, though. Hope it helps you focus on what matters - content!</p>
|
||||
|
||||
|
||||
<p>
|
||||
By <a href="/about">Wouter Groeneveld</a> on <time datetime="2021-03-20">20 March 2021</time>.
|
||||
</p>
|
||||
]]>
|
||||
</description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
|
@ -35,6 +35,22 @@ describe("collect RSS links of articles since certain period", () => {
|
|||
])
|
||||
})
|
||||
|
||||
test("collects if time tag found in content that acts as an update stamp", async () => {
|
||||
// sample item: pubDate 2021-03-16, timestamp updated: 2021-03-20
|
||||
xml = (await fs.readFile('./test/__mocks__/samplerss-updated-timestamp.xml')).toString()
|
||||
|
||||
const collected = collect(xml, dayjs('2021-03-19').toDate())
|
||||
expect(collected.length).toBe(1)
|
||||
})
|
||||
|
||||
test("does not collect if time tag found in content but still older than since", async () => {
|
||||
// sample item: pubDate 2021-03-16, timestamp updated: 2021-03-20
|
||||
xml = (await fs.readFile('./test/__mocks__/samplerss-updated-timestamp.xml')).toString()
|
||||
|
||||
const collected = collect(xml, dayjs('2021-03-21').toDate())
|
||||
expect(collected.length).toBe(0)
|
||||
})
|
||||
|
||||
test("collects nothing if date in future and since nothing new in feed", () => {
|
||||
const collected = collect(xml, dayjs().add(7, 'day').toDate())
|
||||
expect(collected.length).toEqual(0)
|
||||
|
|
Loading…
Reference in New Issue