extract context if explicitly mentioned @http... url

This commit is contained in:
Wouter Groeneveld 2021-04-24 20:03:58 +02:00
parent b43eaa347c
commit 513ed565dc
6 changed files with 124 additions and 9 deletions

View File

@ -28,7 +28,7 @@ function convertAtomItemToMd(item, notesdir) {
let mddata = ejs.render(templates.markdown, { item })
if(item.media.length > 0) {
if(item.media?.length > 0) {
mddata += '\n' + ejs.render(templates.enclosures, { images: item.media }, { rmWhitespace: true })
}
@ -44,6 +44,23 @@ function trimIfNeeded(title, count, prefix) {
return prefix + title
}
/**
 * Determines the URL a feed entry refers to (its "context").
 *
 * Two sources are checked, in order:
 *  1. the entry's `thr:in-reply-to` element, whose `@_ref` attribute is returned as-is;
 *  2. an explicit mention in the decoded HTML content of the form `@<a ... href="...">`,
 *     in which case the value of that `href` attribute is returned.
 *
 * @param {object} item - parsed feed entry; only `item['thr:in-reply-to']` is read.
 * @param {string} content - decoded HTML content of the entry.
 * @returns {string} the detected context URL, or "" when neither source yields one.
 */
function detectContext(item, content) {
  // format: <thr:in-reply-to ref='https://social.linux.pizza/users/StampedingLonghorn/statuses/105821099684887793' href='https://social.linux.pizza/users/StampedingLonghorn/statuses/105821099684887793'/>
  const inReplyTo = item['thr:in-reply-to']
  if (inReplyTo) {
    return inReplyTo['@_ref']
  }

  // could also be: manually in text '@<a ... href="...">'
  // match() returns null (not an empty array) when the pattern is not found,
  // e.g. for an "@<a" mention without an href attribute, so guard before indexing.
  // The pattern itself requires the literal "@<a", so no separate pre-check is needed.
  const res = content.match(/@<a\s(.*?)href="(.*?)".*?>/)
  if (res !== null) {
    return res[2]
  }

  return ""
}
// opts:
// notesdir = `${__dirname}/content/notes`
// url = "https://chat.brainbaking.com/users/wouter/feed";
@ -74,24 +91,26 @@ async function parseMastoFeed(options) {
const entries = root.feed.entry.map ? root.feed.entry : [root.feed.entry]
const items = entries.map(item => {
const content = ent.decode(ent.decode(item.content['#text'])) // format: &lt;span class=&quot;h-card....
const date = dayjs.utc(item.published).utcOffset(utcOffset)
const year = date.format("YYYY")
const month = date.format("MM")
const day = date.format("DD")
// format: <thr:in-reply-to ref='https://social.linux.pizza/users/StampedingLonghorn/statuses/105821099684887793' href='https://social.linux.pizza/users/StampedingLonghorn/statuses/105821099684887793'/>
const context = item['thr:in-reply-to'] ? item['thr:in-reply-to']['@_ref'] : ""
const context = detectContext(item, content)
const title = escQuotes(ent.decode(ent.decode(item.title)))
const media = item.link?.filter(l =>
l['@_rel'] === 'enclosure' &&
l['@_type'] === 'image/jpeg').map(l => l['@_href'])
// WHY double decode? &#34; = &amp;#34; - first decode '&', then the other char.'
return {
title: trimIfNeeded(title, titleCount, titlePrefix), // summary (cut-off) of content
content: ent.decode(ent.decode(item.content['#text'])), // format: &lt;span class=&quot;h-card....
content,
url: escQuotes(item.id), // format: https://chat.brainbaking.com/objects/0707fd54-185d-4ee7-9204-be370d57663c
context: escQuotes(context),
contextFromMastodon: item['thr:in-reply-to'],
id: stripBeforeLastSlash(item.id),
media,
hash: `${day}h${date.format("HH")}m${date.format("mm")}s${date.format("ss")}`,
@ -101,7 +120,7 @@ async function parseMastoFeed(options) {
day
}
})
.filter(itm => ignoreReplies ? !itm.context : true)
.filter(itm => ignoreReplies ? !itm.contextFromMastodon : true)
.filter(itm => !notes.includes(`${itm.year}/${itm.month}/${itm.hash}`))
.forEach(itm => convertAtomItemToMd(itm, notesdir))
}

View File

@ -1,7 +1,9 @@
const markdown = `---
source: "<%- item.url %>"
<% if (item.context) { -%>
context: "<%- item.context %>"
<% } -%>
title: "<%- item.title %>"
date: "<%- item.year %>-<%- item.month %>-<%- item.day %>T<%- item.date.format('HH:mm:ss') %>"
---

View File

@ -0,0 +1,50 @@
<?xml version="1.0" encoding="UTF-8"?>
<feed
xmlns="http://www.w3.org/2005/Atom"
xmlns:thr="http://purl.org/syndication/thread/1.0"
xmlns:activity="http://activitystrea.ms/spec/1.0/"
xmlns:poco="http://portablecontacts.net/spec/1.0"
xmlns:ostatus="http://ostatus.org/schema/1.0">
<id>https://chat.brainbaking.com/users/wouter/feed.atom</id>
<title>wouter's timeline</title>
<updated>2021-03-02T16:18:46</updated>
<logo>https://chat.brainbaking.com/media/f39bcd85-5098-45e2-b395-e274b712d512/headshot_2020.jpg</logo>
<link rel="self" href="https://chat.brainbaking.com/users/wouter/feed.atom" type="application/atom+xml"/>
<author>
<id>https://chat.brainbaking.com/users/wouter</id>
<activity:object>http://activitystrea.ms/schema/1.0/person</activity:object>
<uri>https://chat.brainbaking.com/users/wouter</uri>
<poco:preferredUsername>wouter</poco:preferredUsername>
<poco:displayName>Wouter Groeneveld</poco:displayName>
<poco:note>Level 35 Brain Baker. Loving the smell of freshly baked thoughts (and bread) in the morning 🍞. Sometimes convincing others to bake their brain (and bread) too 🧠. </poco:note>
<summary>Level 35 Brain Baker. Loving the smell of freshly baked thoughts (and bread) in the morning 🍞. Sometimes convincing others to bake their brain (and bread) too 🧠. </summary>
<name>wouter</name>
<link rel="avatar" href="https://chat.brainbaking.com/media/f39bcd85-5098-45e2-b395-e274b712d512/headshot_2020.jpg"/>
<link rel="header" href="https://chat.brainbaking.com/media/3399cd78-4fd4-40ab-a174-c7805576a826/boekcover2.jpg"/>
<ap_enabled>true</ap_enabled>
</author>
<link rel="next" href="https://chat.brainbaking.com/users/wouter/feed.atom?max_id=A4fIjNa6N1OJmaSMAS" type="application/atom+xml"/>
<entry>
<activity:object-type>http://activitystrea.ms/schema/1.0/note</activity:object-type>
<activity:verb>http://activitystrea.ms/schema/1.0/post</activity:verb>
<id>https://chat.brainbaking.com/objects/b5b67e88-eda8-45dd-ab8f-54443b62e250</id>
<title>some title</title>
<content type="html">&lt;span class=&quot;h-card&quot;&gt;@&lt;a class=&quot;u-url mention&quot; href=&quot;https://reply-to-stuff&quot; rel=&quot;ugc&quot;&gt; in reply to previous url test</content>
<published>2021-03-20T11:12:08.955177Z</published>
<updated>2021-03-20T11:12:08.955177Z</updated>
<ostatus:conversation ref="tag:mastodon.social,2021-03-20:objectId=227433498:objectType=Conversation">
tag:mastodon.social,2021-03-20:objectId=227433498:objectType=Conversation
</ostatus:conversation>
</entry>
</feed>

View File

@ -1,9 +1,20 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`mastodon feed parser tests parse creates MD with context if in-reply-to 1`] = `
"---
source: \\"https://chat.brainbaking.com/objects/2e58289c-f5f0-415c-b2e1-62c74662aa16\\"
context: \\"https://social.linux.pizza/users/StampedingLonghorn/statuses/105821099684887793\\"
title: \\"@StampedingLonghorn I tried to chase him away, but you know how that turned out... 😼 There's ...\\"
date: \\"2021-03-02T16:18:46\\"
---
<span class=\\"h-card\\"><a class=\\"u-url mention\\" data-user=\\"A4nwg4LYyh4WgrJOXg\\" href=\\"https://social.linux.pizza/@StampedingLonghorn\\" rel=\\"ugc\\">@<span>StampedingLonghorn</span></a></span> I tried to chase him away, but you know how that turned out... 😼 There's even cat hair inside the cases... (to be clear: also unintentional)
"
`;
exports[`mastodon feed parser tests parse creates correct MD structure 1`] = `
"---
source: \\"https://chat.brainbaking.com/objects/77a3ecfb-47e1-4d7a-a24a-8b779d80a8ac\\"
context: \\"\\"
title: \\"I pulled the Google plug and installed LineageOS: https://brainbaking.com/post/2021/03/getting-ri...\\"
date: \\"2021-03-01T19:03:35\\"
---
@ -15,7 +26,6 @@ I pulled the Google plug and installed LineageOS: <a href=\\"https://brainbaking
exports[`mastodon feed parser tests parse embedded images 1`] = `
"---
source: \\"https://chat.brainbaking.com/objects/a51e13ce-d618-4602-84f7-f398126510ff\\"
context: \\"\\"
title: \\"Enjoyed an afternoon of oldskool Diablo II on the ...\\"
date: \\"2021-03-14T17:41:53\\"
---

View File

@ -29,6 +29,16 @@ describe("mastodon feed parser tests", () => {
dir = await fsp.readdir(`${dumpdir}/2021/03`, { withFileTypes: true })
expect(dir.length).toBe(1)
})
test("does not ignore explicit '@url' replies if ignoreReplies is set to true", async () => {
await parseMastoFeed({
url: "masto-feed-at-url",
notesdir: dumpdir,
ignoreReplies: true
})
dir = await fsp.readdir(`${dumpdir}/2021/03`, { withFileTypes: true })
expect(dir.length).toBe(1)
})
test("does not ignore replies if ignoreReplies is set to false", async () => {
await parseMastoFeed({
url: "masto-feed-with-replies",
@ -127,9 +137,25 @@ describe("mastodon feed parser tests", () => {
titleCount: 5000
})
const actualMd = await fsp.readFile(`${dumpdir}/2021/03/02h16m18s46.md`)
const actualMd = (await fsp.readFile(`${dumpdir}/2021/03/02h16m18s46.md`)).toString()
expect(actualMd).toMatchSnapshot()
const expectedReplyTo = "https://social.linux.pizza/users/StampedingLonghorn/statuses/105821099684887793"
const md = frontMatterParser.parseSync(actualMd)
expect(md.data.context).toBe(expectedReplyTo)
})
test("parse creates MD with context if @http(s) URL", async () => {
await parseMastoFeed({
url: "masto-feed-at-url",
notesdir: dumpdir,
utcOffset: 0,
titleCount: 5000
})
const actualMd = await fsp.readFile(`${dumpdir}/2021/03/20h11m12s08.md`)
const expectedReplyTo = "https://reply-to-stuff"
const md = frontMatterParser.parseSync(actualMd.toString())
expect(md.data.context).toBe(expectedReplyTo)
})

View File

@ -24,7 +24,15 @@ describe("mastodon feed parser end to end scenario test", () => {
notesdir: dumpdir
})
let dir = await fsp.readdir(`${dumpdir}/2021/03`, { withFileTypes: true })
const dirroot = await fsp.readdir(`${dumpdir}`, { withFileTypes: true })
expect(dirroot.length).toBe(1)
const year = dirroot[0].name
const dirmonth = await fsp.readdir(`${dumpdir}/${year}`, { withFileTypes: true })
expect(dirmonth.length).toBe(1)
const month = dirmonth[0].name
const dir = await fsp.readdir(`${dumpdir}/${year}/${month}`, { withFileTypes: true })
expect(dir.length).not.toBe(0)
})