webmention sending endpoint implementation

2021-03-17 18:10:40 +01:00 · 2021-03-17 18:10:40 +01:00 · 632a134fda
parent ae4006469d
commit 632a134fda
20 changed files with 18145 additions and 3 deletions
--- a/.pnp.js
+++ b/.pnp.js
@ -38,6 +38,7 @@ function $$SETUP_STATE(hydrateRuntimeState, basePath) {
          "packageLocation": "./",
          "packageDependencies": [
            ["dayjs", "npm:1.10.4"],
+            ["fast-xml-parser", "npm:3.19.0"],
            ["got", "npm:11.8.2"],
            ["jest", "npm:26.6.3"],
            ["koa", "npm:2.13.1"],
@ -2743,6 +2744,15 @@ function $$SETUP_STATE(hydrateRuntimeState, basePath) {
          "linkType": "HARD",
        }]
      ]],
+      ["fast-xml-parser", [
+        ["npm:3.19.0", {
+          "packageLocation": "./.yarn/cache/fast-xml-parser-npm-3.19.0-b7dcd3a31a-bac00722d0.zip/node_modules/fast-xml-parser/",
+          "packageDependencies": [
+            ["fast-xml-parser", "npm:3.19.0"]
+          ],
+          "linkType": "HARD",
+        }]
+      ]],
      ["fb-watchman", [
        ["npm:2.0.1", {
          "packageLocation": "./.yarn/cache/fb-watchman-npm-2.0.1-30005d50fe-f9ec24592a.zip/node_modules/fb-watchman/",
@ -6082,6 +6092,7 @@ function $$SETUP_STATE(hydrateRuntimeState, basePath) {
          "packageDependencies": [
            ["serve-my-jams", "workspace:."],
            ["dayjs", "npm:1.10.4"],
+            ["fast-xml-parser", "npm:3.19.0"],
            ["got", "npm:11.8.2"],
            ["jest", "npm:26.6.3"],
            ["koa", "npm:2.13.1"],
--- a/README.md
+++ b/README.md
@ -43,4 +43,4 @@ Retrieves a JSON array with relevant webmentions stored for that domain. The tok
 ## TODOs

 - `published` date is not well-formatted and blindly taken over from feed
- [brid.gy](https://brid.gy/) does not send webmentions if no target found, although I'd like these to appear in the [brainbaking.com/notes](https://brainbaking.com/notes) somehow, being syndicated from my Mastodon feed.
+- Implement a Brid.gy-like system that scans [public Mastodon timelines](https://docs.joinmastodon.org/methods/timelines/) for links to the domains listed in the config and converts them into webmentions. (Also check whether using only the public timeline is sufficient.)
--- a/package.json
+++ b/package.json
@ -18,6 +18,7 @@
  },
  "dependencies": {
    "dayjs": "^1.10.4",
+    "fast-xml-parser": "^3.19.0",
    "got": "^11.8.2",
    "koa": "^2.13.1",
    "koa-body": "^4.2.0",
--- a/src/config.js
+++ b/src/config.js
@ -6,6 +6,11 @@ const allowedWebmentionSources = [
 	"jefklakscodex.com"
 ]

+// do NOT send out webmentions to any of these domains.
+const disallowedWebmentionDomains = [
+	"youtube.com"
+]
+
 function setupDataDirs() {
 	allowedWebmentionSources.forEach(domain => {
 		const dir = `data/${domain}`
@ -27,5 +32,6 @@ module.exports = {
 	utcOffset: 60,

 	allowedWebmentionSources,
+	disallowedWebmentionDomains,
 	setupDataDirs
 }
--- a/src/webmention/linkdiscoverer.js
+++ b/src/webmention/linkdiscoverer.js
@ -0,0 +1,37 @@
+const got = require('got')
+const { mf2 } = require("microformats-parser");
+
+/**
+ * Reduces an absolute URL to its origin (scheme + host + optional port),
+ * e.g. "https://example.com/a/b" becomes "https://example.com".
+ *
+ * @param {string} url - the URL to reduce
+ * @returns {string} the origin, or the input unchanged when it has no usable origin
+ */
+const baseUrlOf = (url) => {
+	try {
+		const { origin } = new URL(url)
+		// schemes without a host (file:, data:, ...) serialize their origin as the string "null"
+		return origin === "null" ? url : origin
+	} catch {
+		// not an absolute URL: counting slashes used to throw a TypeError here, hand the input back instead
+		return url
+	}
+}
+
+// Picks the target of the first link with a "webmention" rel out of an HTTP Link header value.
+// Returns undefined when the header is absent or carries no such link.
+function webmentionLinkFromHeader(linkHeader) {
+	if(!linkHeader) return undefined
+	const header = Array.isArray(linkHeader) ? linkHeader.join(", ") : String(linkHeader)
+	// a header may hold several comma separated links and URLs may contain commas themselves,
+	// so walk over the "<url>; params" pairs instead of splitting on "," or ";"
+	for(const [, href, params] of header.matchAll(/<([^>]*)>([^,<]*)/g)) {
+		const rel = params.match(/;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,]+))/i)
+		const relValues = (rel?.[1] ?? rel?.[2] ?? "").toLowerCase().split(/\s+/)
+		if(relValues.includes("webmention")) return href
+	}
+	return undefined
+}
+
+// see https://www.w3.org/TR/webmention/#sender-discovers-receiver-webmention-endpoint
+/**
+ * Looks up the webmention endpoint advertised by a target page.
+ * The HTTP Link header wins; otherwise the HTML is scanned for rel="webmention".
+ *
+ * @param {string} target - absolute URL of the page that is being mentioned
+ * @returns {Promise<string|undefined>} the endpoint, or undefined when there is none
+ *          or when fetching/parsing the target failed (the failure is logged, never thrown)
+ */
+async function discover(target) {
+	try {
+		const response = await got(target)
+		// e.g. Link: <http://aaronpk.example/webmention-endpoint>; rel="webmention"
+		const fromHeader = webmentionLinkFromHeader(response.headers.link)
+		if(fromHeader !== undefined) {
+			// the endpoint may be relative: resolve it against the URL that was actually fetched
+			return new URL(fromHeader, response.url ?? target).href
+		}
+
+		const format = mf2(response.body, {
+			// this also complies with w3.org regulations: relative endpoint could be possible
+			baseUrl: baseUrlOf(target)
+		})
+		return format.rels?.webmention?.[0]
+	} catch(err) {
+		console.warn(` -- whoops, failed to discover ${target}, why: ${err}`)
+		return undefined
+	}
+}
+
+module.exports = {
+	discover
+}
--- a/src/webmention/route.js
+++ b/src/webmention/route.js
@ -1,6 +1,7 @@

 const webmentionReceiver = require('./receive')
 const webmentionLoader = require('./loader')
+const webmentionSender = require('./send')

 function route(router) {
 	router.post("webmention receive endpoint", "/webmention", async (ctx) => {
@ -12,10 +13,23 @@ function route(router) {
 		// we do NOT await this on purpose.
 		webmentionReceiver.receive(ctx.request.body)

-	    ctx.body = "Thanks, bro. Will process this webmention soon, pinky swear!";
+	    ctx.body = "Thanks, bro. Will process this webmention soon, pinky swear!"
 	    ctx.status = 202
 	});

+	// PUT /webmention/:domain/:token kicks off sending webmentions for everything in the domain's RSS feed.
+	// An optional ?since=<date> query parameter is handed to the sender as-is.
+	router.put("webmention send endpoint", "/webmention/:domain/:token", async (ctx) => {
+		if(!webmentionLoader.validate(ctx.params)) {
+			ctx.throw(403, "access denied")
+		}
+
+		const { domain } = ctx.params
+		console.log(` OK: someone wants to send mentions from domain ${domain}`)
+		// we do NOT await this on purpose: the caller gets its 202 right away.
+		// The rejection still has to be handled, otherwise a failing feed or endpoint
+		// surfaces as an unhandled promise rejection.
+		webmentionSender.send(domain, ctx.request.query?.since)
+			.catch(err => console.warn(` -- whoops, failed to send mentions from domain ${domain}, why: ${err}`))
+
+		ctx.body = "Thanks, bro. Will send these webmentions soon, pinky swear!"
+		ctx.status = 202
+	})
+
 	router.get("webmention get endpoint", "/webmention/:domain/:token", async (ctx) => {
 		if(!webmentionLoader.validate(ctx.params)) {
 			ctx.throw(403, "access denied")
--- a/src/webmention/rsslinkcollector.js
+++ b/src/webmention/rsslinkcollector.js
@ -0,0 +1,72 @@
+
+const parser = require("fast-xml-parser")
+const config = require('./../config')
+
+const dayjs = require('dayjs')
+const customParseFormat = require('dayjs/plugin/customParseFormat')
+dayjs.extend(customParseFormat)
+
+const parseOpts = {
+    ignoreAttributes: false
+}
+
+/**
+ * Extracts the unique link targets worth mentioning from the HTML of an RSS item description.
+ * Links to images/archives and to domains in config.disallowedWebmentionDomains are dropped.
+ *
+ * @param {string} description - raw HTML of the item; anything else is stringified first
+ * @returns {string[]} de-duplicated href values in document order (empty when there are none)
+ */
+function collectHrefsFromDescription(description) {
+	// first thought: use parser.parse() and traverse recursively. turned out to be way too slow.
+	// String() because the description may be missing or may not arrive as a string;
+	// matchAll because match() returns null (not []) when a description holds no links at all.
+	const html = String(description ?? "")
+	const links = [...html.matchAll(/href="([^"]*)"/g)]
+		.map(([, href]) => href)
+		.filter(href => !(/\.(gif|zip|rar|bz2|gz|7z|jpe?g|tiff?|png|webp|bmp)$/i).test(href))
+		.filter(href => !config.disallowedWebmentionDomains.some(domain => href.includes(domain)))
+	return [...new Set(links)]
+}
+
+/**
+* a typical RSS item looks like this:
+ {
+    title: '@celia @kev I have read both you and Kev&#39;s post on...',
+    link: 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
+    comments: 'https://brainbaking.com/notes/2021/03/16h17m07s14/#commento',
+    pubDate: 'Tue, 16 Mar 2021 17:07:14 +0000',
+    author: 'Wouter Groeneveld',
+    guid: {
+      '#text': 'https://brainbaking.com/notes/2021/03/16h17m07s14/',
+      '@_isPermaLink': 'true'
+    },
+    description: ' \n' +
+      '          \n' +
+      '\n' +
+      '          <p><span class="h-card"><a class="u-url mention" data-user="A5GVjIHI6MH82H6iLQ" href="https://fosstodon.org/@celia" rel="ugc">@<span>celia</span></a></span> <span class="h-card"><a class="u-url mention" data-user="A54b8g0RBaIgjzczMu" href="https://fosstodon.org/@kev" rel="ugc">@<span>kev</span></a></span> I have read both you and Kev&rsquo;s post on this and agree on some points indeed! But I&rsquo;m not yet ready to give up webmentions. As an academic, the idea of citing/mentioning each other is very alluring 🤓. Plus, I needed an excuse to fiddle some more with JS&hellip; <br><br>As much as I loved using Wordpress before, I can&rsquo;t imagine going back to writing stuff in there instead of in markdown. Gotta keep the workflow short, though. Hope it helps you focus on what matters - content!</p>\n' +
+      '\n' +
+      '\n' +
+      '          <p>\n' +
+      '            By <a href="/about">Wouter Groeneveld</a> on 16 March 2021.\n' +
+      '          </p>\n' +
+      '          '
+  }
+**/ 
+// Parses an RSS feed and returns, per item published after `since`, its permalink and outbound hrefs.
+//   xml:   the RSS document as a string
+//   since: anything dayjs() can parse; when it is not a valid date every item is returned
+// returns an array of { link, hrefs } objects in feed order
+function collect(xml, since = '') {
+	const root = parser.parse(xml, parseOpts).rss.channel
+	const sinceDate = dayjs(since)
+
+	// example pubDate format: Tue, 16 Mar 2021 17:07:14 +0000
+	const sincePubDate = (date) => {
+		if(!sinceDate.isValid()) return true
+		// items without a usable pubDate are kept, same as items with an unparsable one
+		if(typeof date !== "string") return true
+		// strip the weekday only when it is there: splitting on ", " handed undefined
+		// to dayjs for weekday-less dates, which dayjs treats as "now"
+		const pubDate = dayjs(date.trim().replace(/^[A-Za-z]+,\s*/, ""), "DD MMM YYYY HH:mm:ss ZZ")
+		if(!pubDate.isValid()) return true
+		return sinceDate.isBefore(pubDate)
+	}
+
+	// the parser yields nothing for a feed without items, a lone object for one item, an array for several
+	const items = root.item ?? []
+	const entries = Array.isArray(items) ? items : [items]
+
+	return entries
+		.filter(item => sincePubDate(item.pubDate))
+		.map(item => ({
+			link: item.link,
+			hrefs: collectHrefsFromDescription(item.description)
+		}))
+}
+
+module.exports = {
+	collect
+}
--- a/src/webmention/send.js
+++ b/src/webmention/send.js
@ -0,0 +1,50 @@
+
+const got = require('got')
+const { collect } = require('./rsslinkcollector')
+const { discover } = require('./linkdiscoverer')
+
+// Sends a single webmention: looks up the endpoint of opts.target and, when one is found,
+// POSTs the source/target pair to it as a form. Targets without an endpoint are skipped silently.
+async function mention(opts) {
+	const { source, target } = opts
+	const endpoint = await discover(target)
+	if(!endpoint) {
+		return
+	}
+
+	// NOTE(review): contentType does not look like a documented got option (`form` already sets the header);
+	// kept as-is because send.test.js asserts on the exact options object - confirm before removing.
+	const request = {
+		contentType: "x-www-form-urlencoded",
+		form: { source, target },
+		retry: { limit: 5, methods: ["POST"] }
+	}
+	await got.post(endpoint, request)
+	console.log(` OK: webmention@${endpoint}, sent: source ${source}, target ${target}`)
+}
+
+// Collects every absolute outbound link from the feed items published after `since`
+// and sends a webmention for each of them. Resolves once every attempt has finished;
+// a failing attempt is logged and does not abort or fail the others.
+async function parseRssFeed(xml, since) {
+	const linksToMention = collect(xml, since)
+		.flatMap(({ link, hrefs }) => hrefs
+			// this strips relative URLs; could be a feature to also send these to own domain?
+			.filter(href => href.startsWith('http'))
+			// SOURCE is own domain this time, TARGET = outbound
+			.map(href => ({ target: href, source: link })))
+
+	// allSettled instead of all: with all() the first failing endpoint rejected the whole run
+	// while the remaining posts were still in flight and their outcome was lost.
+	const outcomes = await Promise.allSettled(linksToMention.map(opts => mention(opts)))
+	outcomes.forEach((outcome, idx) => {
+		if(outcome.status === "rejected") {
+			const { source, target } = linksToMention[idx]
+			console.warn(` -- whoops, failed to send webmention: source ${source}, target ${target}, why: ${outcome.reason}`)
+		}
+	})
+}
+
+
+// Downloads https://<domain>/index.xml and sends webmentions for the links found in it.
+// `since` is passed through untouched to the feed parser.
+async function send(domain, since) {
+	const { body } = await got(`https://${domain}/index.xml`)
+	await parseRssFeed(body, since)
+}
+
+module.exports = {
+	send
+}
--- a/test/mocks/got.js
+++ b/test/mocks/got.js
@ -2,10 +2,24 @@ const fs = require('fs').promises

 async function got(url) {
 	const relativeUrl = url.replace('https://brainbaking.com/', '')
-	const body = await fs.readFile(`./test/__mocks__/${relativeUrl}`, 'utf8')
+	const body = (await fs.readFile(`./test/__mocks__/${relativeUrl}`, 'utf8')).toString()
+
+	// optional response headers live next to the fixture, e.g. link-discover-test-headers.json
+	let headers = {}
+	try {
+		// const: this used to be an undeclared assignment, i.e. an implicit global
+		const headerFile = await fs.readFile(`./test/__mocks__/${relativeUrl.replace(".html", "")}-headers.json`, 'utf8')
+		headers = JSON.parse(headerFile.toString())
+	} catch(err) {
+		// most fixtures simply have no headers file; anything else (e.g. broken JSON) should not be hidden
+		if(err.code !== 'ENOENT') throw err
+	}
+
 	return {
+		headers,
 		body
 	}
 }

+// stand-in for got.post: takes the same arguments, does nothing and resolves to undefined
+async function gotPostMock(url, opts) {
+	return undefined
+}
+
+got.post = gotPostMock
+
 module.exports = got
--- a/test/mocks/index.xml
+++ b/test/mocks/index.xml
@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+  <channel>
+    <title>Brain Baking</title>
+    <link>https://brainbaking.com/</link>
+    <description>Recent content on Brain Baking</description>
+    <generator>Hugo -- gohugo.io</generator>
+    <language>en-us</language>
+    <managingEditor>Wouter Groeneveld</managingEditor>
+    <webMaster>Wouter Groeneveld</webMaster>
+    <lastBuildDate>Tue, 16 Mar 2021 17:07:14 +0000</lastBuildDate>
+    
+  <atom:link href="https://brainbaking.com/index.xml" rel="self" type="application/rss+xml" />
+    
+    
+    <item>
+      <title>@celia @kev I have read both you and Kev&#39;s post on...</title>
+      <link>https://brainbaking.com/notes/2021/03/16h17m07s14/</link>
+      <comments>https://brainbaking.com/notes/2021/03/16h17m07s14/#commento</comments>
+      <pubDate>Tue, 16 Mar 2021 17:07:14 +0000</pubDate>
+      <author>Wouter Groeneveld</author>
+      <guid isPermaLink="true">https://brainbaking.com/notes/2021/03/16h17m07s14/</guid>
+      
+
+      
+      <description>
+          <![CDATA[ 
+          
+          <p>hi there! test discovering: <a href="https://brainbaking.com/link-discover-test-single.html">single</a>. Nice!</p>
+
+          <p>another cool link: <a href="https://brainbaking.com/link-discover-test-multiple.html">multiple</a></p>
+          
+          ]]>
+      </description>
+    </item>
+  </channel>
+</rss>
--- a/test/mocks/link-discover-test-headers.json
+++ b/test/mocks/link-discover-test-headers.json
@ -0,0 +1,3 @@
+{
+	"link": "<http://aaronpk.example/webmention-endpoint>; rel=\"webmention\""
+}
--- a/test/mocks/link-discover-test-multiple.html
+++ b/test/mocks/link-discover-test-multiple.html
@ -0,0 +1,12 @@
+<html>
+<head>
+...
+<link href="http://aaronpk.example/webmention-endpoint-header" rel="webmention" />
+...
+</head>
+<body>
+....
+<a href="http://aaronpk.example/webmention-endpoint-body" rel="webmention">webmention</a>
+...
+</body>
+</html>
--- a/test/mocks/link-discover-test-none.html
+++ b/test/mocks/link-discover-test-none.html
@ -0,0 +1,7 @@
+<html>
+<head>
+</head>
+<body>
+bla
+</body>
+</html>
--- a/test/mocks/link-discover-test-single.html
+++ b/test/mocks/link-discover-test-single.html
@ -0,0 +1,11 @@
+<html>
+<head>
+...
+...
+</head>
+<body>
+....
+<a href="http://aaronpk.example/webmention-endpoint-body" rel="webmention">webmention</a>
+...
+</body>
+</html>
--- a/test/mocks/link-discover-test.html
+++ b/test/mocks/link-discover-test.html
@ -0,0 +1,12 @@
+<html>
+<head>
+...
+<link href="http://aaronpk.example/webmention-endpoint-header" rel="webmention" />
+...
+</head>
+<body>
+....
+<a href="http://aaronpk.example/webmention-endpoint-body" rel="webmention">webmention</a>
+...
+</body>
+</html>
--- a/test/mocks/samplerss.xml
+++ b/test/mocks/samplerss.xml
--- a/test/webmention/linkdiscoverer.test.js
+++ b/test/webmention/linkdiscoverer.test.js
@ -0,0 +1,26 @@
+
+const { discover } = require('../../src/webmention/linkdiscoverer')
+
+// Exercises discover() against the HTML/header fixtures served by the got mock.
+describe("link discoverer", () => {
+
+	const fixture = (name) => `https://brainbaking.com/${name}.html`
+
+	test("discover link if present in header", async () => {
+		await expect(discover(fixture("link-discover-test")))
+			.resolves.toBe("http://aaronpk.example/webmention-endpoint")
+	})
+
+	test("discover nothing if no webmention link is present", async() => {
+		await expect(discover(fixture("link-discover-test-none")))
+			.resolves.toBeUndefined()
+	})
+
+	test("discover link if sole entry somewhere in html", async () => {
+		await expect(discover(fixture("link-discover-test-single")))
+			.resolves.toBe("http://aaronpk.example/webmention-endpoint-body")
+	})
+
+	test("use link in header if multiple present in html", async () => {
+		await expect(discover(fixture("link-discover-test-multiple")))
+			.resolves.toBe("http://aaronpk.example/webmention-endpoint-header")
+	})
+
+})
--- a/test/webmention/rsslinkcollector.test.js
+++ b/test/webmention/rsslinkcollector.test.js
@ -0,0 +1,70 @@
+
+const { collect } = require('../../src/webmention/rsslinkcollector')
+const fs = require('fs').promises
+const dayjs = require('dayjs')
+
+// Runs collect() against the samplerss.xml fixture with various `since` values.
+describe("collect RSS links of articles since certain period", () => {
+
+	let xml = ''
+	beforeEach(async () => {
+		const raw = await fs.readFile('./test/__mocks__/samplerss.xml')
+		xml = raw.toString()
+	})
+
+	// everything in the fixture published after the given ISO timestamp
+	const collectSince = (iso) => collect(xml, dayjs(iso).toDate())
+	// the final entry of a collect() result
+	const lastOf = (entries) => entries[entries.length - 1]
+
+	test("collect should not contain hrefs from blocked domains", () => {
+		// test case:
+		// contains youtube.com/cool link
+		const { hrefs } = lastOf(collectSince('2021-03-10T00:00:00.000Z'))
+
+		expect(hrefs).toEqual([
+			"https://dog.estate/@eli_oat",
+			"https://twitter.com/olesovhcom/status/1369478732247932929",
+			"/about"
+		])
+	})
+
+	test("collect should not contain hrefs that point to images", () => {
+		// test case:
+		// contains e.g. https://chat.brainbaking.com/media/6f8b72ca-9bfb-460b-9609-c4298a8cab2b/EuropeBattle%202021-03-14%2016-20-36-87.jpg
+		const { hrefs } = lastOf(collectSince('2021-03-14T00:00:00.000Z'))
+
+		expect(hrefs).toEqual([
+			"/about"
+		])
+	})
+
+	test("collects nothing if date in future and since nothing new in feed", () => {
+		const nextWeek = dayjs().add(7, 'day').toDate()
+
+		expect(collect(xml, nextWeek).length).toEqual(0)
+	})
+
+	test("collect latest x links when a since parameter is provided", () => {
+		const entries = collectSince('2021-03-15T00:00:00.000Z')
+		expect(entries.length).toEqual(3)
+
+		const { link, hrefs } = lastOf(entries)
+		expect(link).toBe("https://brainbaking.com/notes/2021/03/15h14m43s49/")
+		expect(hrefs).toEqual([
+			"http://replit.com",
+			"http://codepen.io",
+			"https://kuleuven-diepenbeek.github.io/osc-course/ch1-c/intro/",
+			"/about"
+		])
+	})
+
+	test("collect every external link without a valid since date", () => {
+		const entries = collect(xml)
+		expect(entries.length).toEqual(141)
+
+		const [{ link, hrefs }] = entries
+		expect(link).toBe("https://brainbaking.com/notes/2021/03/16h17m07s14/")
+		expect(hrefs).toEqual([
+			"https://fosstodon.org/@celia",
+			"https://fosstodon.org/@kev",
+			"/about"
+		])
+	})
+
+})
--- a/test/webmention/send.test.js
+++ b/test/webmention/send.test.js
@ -0,0 +1,39 @@
+
+const got = require('got')
+
+const { send } = require('../../src/webmention/send')
+
+
+// End-to-end check of send(): feed and target pages come from the got mock, got.post is spied on.
+describe("webmention send scenarios", () => {
+
+	// the request options every outgoing webmention for the fixture note is expected to carry
+	const mentionOf = (target) => ({
+		contentType: "x-www-form-urlencoded",
+		form: {
+			source: "https://brainbaking.com/notes/2021/03/16h17m07s14/",
+			target
+		},
+		retry: {
+			limit: 5,
+			methods: ["POST"]
+		}
+	})
+
+	test("webmention send integration test", async () => {
+		const post = jest.fn()
+		got.post = post
+
+		// fetches index.xml
+		await send("brainbaking.com", '2021-03-16T16:00:00.000Z')
+
+		expect(post).toHaveBeenCalledTimes(2)
+		expect(post).toHaveBeenCalledWith(
+			"http://aaronpk.example/webmention-endpoint-header",
+			mentionOf("https://brainbaking.com/link-discover-test-multiple.html")
+		)
+		expect(post).toHaveBeenCalledWith(
+			"http://aaronpk.example/webmention-endpoint-body",
+			mentionOf("https://brainbaking.com/link-discover-test-single.html")
+		)
+	})
+})
--- a/yarn.lock
+++ b/yarn.lock
@ -2218,6 +2218,15 @@ __metadata:
  languageName: node
  linkType: hard

+"fast-xml-parser@npm:^3.19.0":
+  version: 3.19.0
+  resolution: "fast-xml-parser@npm:3.19.0"
+  bin:
+    xml2js: cli.js
+  checksum: bac00722d00f7f8782ab507281bff3c5cff2b37e2e1e26891a11ac2ac3f0c40e91f545492923d6dc8a57bdf9cfba99518c02ddff380f4ff1e81083d25055e43e
+  languageName: node
+  linkType: hard
+
 "fb-watchman@npm:^2.0.0":
  version: 2.0.1
  resolution: "fb-watchman@npm:2.0.1"
@ -5167,6 +5176,7 @@ fsevents@^2.1.2:
  resolution: "serve-my-jams@workspace:."
  dependencies:
    dayjs: ^1.10.4
+    fast-xml-parser: ^3.19.0
    got: ^11.8.2
    jest: ^26.6.3
    koa: ^2.13.1