fix domain extraction from .co.uk using publicsuffix

This commit is contained in:
Wouter Groeneveld 2022-05-03 11:34:25 +02:00
parent 522cc3d746
commit f61bda5c5a
4 changed files with 39 additions and 12 deletions

1
go.mod
View File

@ -10,6 +10,7 @@ require (
github.com/rs/zerolog v1.21.0
github.com/stretchr/testify v1.7.0
github.com/tidwall/buntdb v1.2.3
golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4
golang.org/x/time v0.0.0-20220411224347-583f2d630306
willnorris.com/go/microformats v1.1.1
)

9
go.sum
View File

@ -56,18 +56,21 @@ golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73r
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 h1:HVyaeDAYux4pnY+D/SiwmLOR36ewZ4iGQIIrtnuCjFA=
golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba h1:O8mE0/t419eoIwhTFpKVkHiTs/Igowgfkj25AcZrtiE=
golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/time v0.0.0-20220411224347-583f2d630306 h1:+gHMid33q6pen7kv9xvT+JRinntgeXO2AeZVd0AWD3w=
golang.org/x/time v0.0.0-20220411224347-583f2d630306/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@ -2,7 +2,7 @@ package rest
import (
"encoding/json"
"fmt"
"golang.org/x/net/publicsuffix"
"net/http"
"net/url"
"strings"
@ -25,17 +25,24 @@ func Unauthorized(w http.ResponseWriter) {
// This is the same as conf.FetchDomain(wm.Target), only without config, and without error handling.
// Assumes http(s) protocol, which should have been validated before calling this.
func Domain(target string) string {
slashes := strings.Split(target, "/")
if len(slashes) < 3 {
url, err := url.Parse(target)
if err != nil {
return target
}
host := url.Hostname()
if host == "" {
return target
}
withPossibleSubdomain := slashes[2]
suffix, _ := publicsuffix.PublicSuffix(host)
withPossibleSubdomain := strings.ReplaceAll(host, "."+suffix, "")
split := strings.Split(withPossibleSubdomain, ".")
if len(split) <= 2 {
return withPossibleSubdomain // that was the extension, not the subdomain.
if len(split) <= 1 {
return host
}
return fmt.Sprintf("%s.%s", split[1], split[2])
return strings.Join(split[1:], ".") + "." + suffix
}
type imageType []byte
@ -52,7 +59,13 @@ var (
// SiloDomains are domains where mentions of multiple individuals may come from.
// These are privacy issues and will be anonymized as such.
SiloDomains = []string{"brid.gy", "twitter.com", "facebook.com"}
SiloDomains = []string{
"brid.gy",
"twitter.com",
"facebook.com",
"indieweb.social",
"mastodon.social",
}
)
// IsRealImage checks the first few bytes of the provided data to see if it's a real image.

View File

@ -94,7 +94,17 @@ func TestDomainParseFromTarget(t *testing.T) {
{
"parse from localhost domain without extension",
"https://localhost:1313/stuff",
"localhost:1313",
"localhost",
},
{
"UK domain with two dots after the name",
"https://minutestomidnight.co.uk/blog/article.html",
"minutestomidnight.co.uk",
},
{
"UK domain with subdomain",
"https://www.minutestomidnight.co.uk/blog/article.html",
"minutestomidnight.co.uk",
},
{
"malformed http string with too little slashes simply returns same URL",