diff --git a/lualib/lua_cta.lua b/lualib/lua_cta.lua index fb96d5422..071238034 100644 --- a/lualib/lua_cta.lua +++ b/lualib/lua_cta.lua @@ -43,6 +43,50 @@ local M = {} local lua_util = require "lua_util" local lua_maps = require "lua_maps" +local rspamd_util = require "rspamd_util" + +-- Reasonable defaults (can be overridden in rspamd.conf: link_affiliation { ... }) +local DEFAULT_STOPWORDS = { + -- Common TLD tokens and ccTLDs + "com", "net", "org", "info", "biz", "co", "io", "me", "us", "uk", "ru", "de", "fr", "au", "ca", "cn", "jp", "kr", "in", + "eu", + "es", "it", "nl", "pl", "se", "no", "fi", "dk", "cz", "sk", "pt", "tr", "gr", "hu", "ro", "bg", "ua", "by", "lt", "lv", + "ee", + "br", "mx", "ch", "be", "at", "dk", "cz", "sk", "pt", "ar", "cl", "pe", "tw", "th", "ph", "vn", "id", "hk", "sg", "nz", + "za", + "il", "ie", "is", "lu", "si", "hr", "rs", "gl", "ly", + -- Generic / infrastructural + "www", "web", "site", "app", "apps", "cloud", "cdn", "edge", "fastly", "akamai", "akamaihd", "edgesuite", "cloudfront", + -- Tracking/redirect/marketing tokens + "mail", "email", "news", "newsletter", "click", "link", "links", "go", "redir", "redirect", "rdir", "safe", "safelinks", + "trk", "track", "tracking", "ref", "mkt", "mktg", "campaign", "promo", "offer", "offers", + -- ESPs and bulk mailers (tokens found in their eTLD+1) + "mailchimp", "mandrill", "sendgrid", "sparkpost", "sparkpostmail", "amazonses", "ses", "postmark", "postmarkapp", + "mailgun", + "sendinblue", "constantcontact", "list", "manage", "rs6", "aweber", "hubspot", "campaignmonitor", "cmail", "klaviyo", + "sailthru", + "drip", "convertkit", "getresponse", "mautic", "braze", "acoustic", "responsys", "eloqua", "iterable", "sendy", + "emarsys", "mailjet", + "mailerlite", "mailerq", "mailrelay", "mailup", "omnisend", "clickdimensions", "dotdigital", "pepipost" +} + +local DEFAULT_WHITELIST = { + -- Intentionally empty by default. Users can add trusted eTLD+1 domains here +} + +local DEFAULT_BLACKLIST = { + -- Popular shorteners / redirection eTLD+1 + "t.co", "bit.ly", "goo.gl", "tinyurl.com", "lnkd.in", "buff.ly", "ow.ly", "rebrand.ly", "bitly.com", "is.gd", "v.gd", + "t.ly", + "cutt.ly", "shorturl.at", "reurl.cc", "rb.gy", "s.id", "trib.al", + -- Common ESP/tracker link domains (treat as non-affiliated by default) + "list-manage.com", "mandrillapp.com", "sendgrid.net", "sparkpostmail.com", "amazonses.com", "postmarkapp.com", + "mailgun.org", + "sendinblue.com", "constantcontact.com", "campaignmonitor.com", "cmail1.com", "cmail2.com", "aweber.com", "hubspot.com", + "exacttarget.com", "clickdimensions.com", "eloqua.com", "responsys.net", "emarsys.net", "mailjet.com", "klaviyo.com", + "dripemail2.com", + "getresponse.com", "benchmarkemail.com", "omnisend.com", "mailerlite.com", "dotdigital.com" +} local settings = { min_similarity = 0.5, @@ -57,14 +101,26 @@ local function load_settings() local opts = (cfg and cfg:get_all_opt('link_affiliation')) or {} settings = lua_util.override_defaults(settings, opts) -- Convert map definitions to maps if needed - if settings.stopwords and (type(settings.stopwords) ~= 'table' or not settings.stopwords.get_key) then - settings.stopwords = lua_maps.map_add_from_ucl(settings.stopwords, 'set', 'link affiliation stopwords') + if settings.stopwords then + if type(settings.stopwords) ~= 'table' or not settings.stopwords.get_key then + settings.stopwords = lua_maps.map_add_from_ucl(settings.stopwords, 'set', 'link affiliation stopwords') + end + else + settings.stopwords = lua_maps.map_add_from_ucl(DEFAULT_STOPWORDS, 'set', 'link affiliation stopwords (default)') end - if settings.whitelist and (type(settings.whitelist) ~= 'table' or not settings.whitelist.get_key) then - settings.whitelist = lua_maps.map_add_from_ucl(settings.whitelist, 'set', 'link affiliation whitelist') + if settings.whitelist then + if type(settings.whitelist) ~= 'table' or not settings.whitelist.get_key then + settings.whitelist = lua_maps.map_add_from_ucl(settings.whitelist, 'set', 'link affiliation whitelist') + end + else + settings.whitelist = lua_maps.map_add_from_ucl(DEFAULT_WHITELIST, 'set', 'link affiliation whitelist (default)') end - if settings.blacklist and (type(settings.blacklist) ~= 'table' or not settings.blacklist.get_key) then - settings.blacklist = lua_maps.map_add_from_ucl(settings.blacklist, 'set', 'link affiliation blacklist') + if settings.blacklist then + if type(settings.blacklist) ~= 'table' or not settings.blacklist.get_key then + settings.blacklist = lua_maps.map_add_from_ucl(settings.blacklist, 'set', 'link affiliation blacklist') + end + else + settings.blacklist = lua_maps.map_add_from_ucl(DEFAULT_BLACKLIST, 'set', 'link affiliation blacklist (default)') end end @@ -115,13 +171,7 @@ M.process_html_links = function(task, part, ctx) local fp_tokens = etld1_tokens(first_party) for _, c in ipairs(cands) do - local etld1 = c.etld1 or c.host or '' - -- approximate etld1 from host when not provided (split last two labels) - do - local h = tostring(etld1) - local p1, p2 = string.match(h, "([^.]+)%.([^.]+)$") - if p1 and p2 then etld1 = p1 .. "." .. p2 end - end + local etld1 = c.etld1 or rspamd_util.get_tld(c.host or '') or (c.host or '') local toks = etld1_tokens(etld1) local sim = jaccard(fp_tokens, toks) @@ -153,12 +203,7 @@ M.process_html_links = function(task, part, ctx) return a.part_order < b.part_order end) local cta = cands[1] - local etld1 = cta.etld1 or cta.host or '' - do - local h = tostring(etld1) - local p1, p2 = string.match(h, "([^.]+)%.([^.]+)$") - if p1 and p2 then etld1 = p1 .. "." .. p2 end - end + local etld1 = cta.etld1 or rspamd_util.get_tld(cta.host or '') or (cta.host or '') local toks = etld1_tokens(etld1) local sim = jaccard(fp_tokens, toks) res.cta_affiliated = (sim >= settings.min_similarity) diff --git a/lualib/lua_selectors/transforms.lua b/lualib/lua_selectors/transforms.lua index 6c6bc7117..4234b115f 100644 --- a/lualib/lua_selectors/transforms.lua +++ b/lualib/lua_selectors/transforms.lua @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -]]-- +]] -- local fun = require 'fun' local lua_util = require "lua_util" @@ -202,7 +202,7 @@ the second argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`, end, ['description'] = 'Extracts substring; the first argument is start, the second is the last (like in Lua)', ['args_schema'] = { (ts.number + ts.string / tonumber):is_optional(), - (ts.number + ts.string / tonumber):is_optional() } + (ts.number + ts.string / tonumber):is_optional() } }, -- Prepends a string or a strings list ['prepend'] = { @@ -460,7 +460,7 @@ Empty string comes the first argument or 'true', non-empty string comes nil]], ['description'] = 'Applies mask to IP address.' .. ' The first argument is the mask for IPv4 addresses, the second is the mask for IPv6 addresses.', ['args_schema'] = { (ts.number + ts.string / tonumber), - (ts.number + ts.string / tonumber):is_optional() } + (ts.number + ts.string / tonumber):is_optional() } }, -- Returns the string(s) with all non ascii chars replaced ['to_ascii'] = { @@ -472,9 +472,9 @@ Empty string comes the first argument or 'true', non-empty string comes nil]], ['process'] = function(inp, _, args) if type(inp) == 'table' then return fun.map( - function(s) - return string.gsub(tostring(s), '[\128-\255]', args[1] or '?') - end, inp), 'string_list' + function(s) + return string.gsub(tostring(s), '[\128-\255]', args[1] or '?') + end, inp), 'string_list' else return string.gsub(tostring(inp), '[\128-\255]', '?'), 'string' end @@ -492,7 +492,7 @@ Empty string comes the first argument or 'true', non-empty string comes nil]], ['process'] = function(inp, _, _) return rspamd_util.get_tld(inp), 'string' end, - ['description'] = 'Extracts tld from a hostname represented as a string', + ['description'] = 'Returns effective second-level domain (eSLD) using the Public Suffix List', ['args_schema'] = {} }, -- Converts list of strings to numbers and returns a packed string diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c index 28c42f9f9..8e62318ab 100644 --- a/src/lua/lua_url.c +++ b/src/lua/lua_url.c @@ -663,7 +663,9 @@ lua_url_set_redirected(lua_State *L) /*** * @method url:get_tld() - * Get effective second level domain part (eSLD) of the url host + * Get effective second level domain (eSLD) of the URL host. + * This method uses the Public Suffix List (PSL) to determine boundaries + * and compute the eSLD, not the top-level domain. * @return {string} effective second level domain part (eSLD) of the url host */ static int diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index f2e9b8fa9..12323f5da 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -191,7 +191,9 @@ LUA_FUNCTION_DEF(util, humanize_number); /*** * @function util.get_tld(host) - * Returns effective second level domain part (eSLD) for the specified host + * Returns effective second level domain (eSLD) for the specified host. + * This function uses the Public Suffix List (PSL) to determine boundaries + * and compute the eSLD, not the top-level domain. * * @param {string} host hostname * @return {string} eSLD part of the hostname or the full hostname if eSLD was not found