Browse Source

[Project] Some rework of the CTA defaults

pull/5608/head
Vsevolod Stakhov 4 weeks ago
parent
commit
1e1103916f
No known key found for this signature in database GPG Key ID: 7647B6790081437
  1. 83
      lualib/lua_cta.lua
  2. 14
      lualib/lua_selectors/transforms.lua
  3. 4
      src/lua/lua_url.c
  4. 4
      src/lua/lua_util.c

83
lualib/lua_cta.lua

@ -43,6 +43,50 @@ local M = {}
local lua_util = require "lua_util"
local lua_maps = require "lua_maps"
local rspamd_util = require "rspamd_util"
-- Reasonable defaults (can be overridden in rspamd.conf: link_affiliation { ... })
-- Default stopword tokens for link affiliation. This list is the fallback that
-- gets registered as the 'link affiliation stopwords (default)' set when the
-- `stopwords` option is not configured. Each token is listed exactly once
-- (the ccTLD group previously repeated "dk", "cz", "sk" and "pt").
local DEFAULT_STOPWORDS = {
  -- Common TLD tokens and ccTLDs
  "com", "net", "org", "info", "biz", "co", "io", "me", "us", "uk",
  "ru", "de", "fr", "au", "ca", "cn", "jp", "kr", "in", "eu",
  "es", "it", "nl", "pl", "se", "no", "fi", "dk", "cz", "sk",
  "pt", "tr", "gr", "hu", "ro", "bg", "ua", "by", "lt", "lv",
  "ee", "br", "mx", "ch", "be", "at", "ar", "cl", "pe", "tw",
  "th", "ph", "vn", "id", "hk", "sg", "nz", "za",
  "il", "ie", "is", "lu", "si", "hr", "rs", "gl", "ly",
  -- Generic / infrastructural
  "www", "web", "site", "app", "apps", "cloud", "cdn", "edge", "fastly",
  "akamai", "akamaihd", "edgesuite", "cloudfront",
  -- Tracking/redirect/marketing tokens
  "mail", "email", "news", "newsletter", "click", "link", "links", "go",
  "redir", "redirect", "rdir", "safe", "safelinks",
  "trk", "track", "tracking", "ref", "mkt", "mktg", "campaign", "promo",
  "offer", "offers",
  -- ESPs and bulk mailers (tokens found in their eTLD+1)
  "mailchimp", "mandrill", "sendgrid", "sparkpost", "sparkpostmail",
  "amazonses", "ses", "postmark", "postmarkapp", "mailgun",
  "sendinblue", "constantcontact", "list", "manage", "rs6", "aweber",
  "hubspot", "campaignmonitor", "cmail", "klaviyo", "sailthru",
  "drip", "convertkit", "getresponse", "mautic", "braze", "acoustic",
  "responsys", "eloqua", "iterable", "sendy", "emarsys", "mailjet",
  "mailerlite", "mailerq", "mailrelay", "mailup", "omnisend",
  "clickdimensions", "dotdigital", "pepipost"
}
-- Default whitelist of trusted eTLD+1 domains. It ships empty on purpose:
-- entries are expected to come from user configuration only.
local DEFAULT_WHITELIST = {}
-- Default blacklist of eTLD+1 domains. This list is the fallback that gets
-- registered as the 'link affiliation blacklist (default)' set when the
-- `blacklist` option is not configured.
local DEFAULT_BLACKLIST = {
  -- Popular shorteners / redirection eTLD+1
  "t.co", "bit.ly", "goo.gl", "tinyurl.com", "lnkd.in", "buff.ly",
  "ow.ly", "rebrand.ly", "bitly.com", "is.gd", "v.gd", "t.ly",
  "cutt.ly", "shorturl.at", "reurl.cc", "rb.gy", "s.id", "trib.al",
  -- Common ESP/tracker link domains (treat as non-affiliated by default)
  "list-manage.com", "mandrillapp.com", "sendgrid.net",
  "sparkpostmail.com", "amazonses.com", "postmarkapp.com",
  "mailgun.org", "sendinblue.com", "constantcontact.com",
  "campaignmonitor.com", "cmail1.com", "cmail2.com",
  "aweber.com", "hubspot.com", "exacttarget.com",
  "clickdimensions.com", "eloqua.com", "responsys.net",
  "emarsys.net", "mailjet.com", "klaviyo.com",
  "dripemail2.com", "getresponse.com", "benchmarkemail.com",
  "omnisend.com", "mailerlite.com", "dotdigital.com"
}
local settings = {
min_similarity = 0.5,
@ -57,14 +101,26 @@ local function load_settings()
local opts = (cfg and cfg:get_all_opt('link_affiliation')) or {}
settings = lua_util.override_defaults(settings, opts)
-- Convert map definitions to maps if needed
if settings.stopwords and (type(settings.stopwords) ~= 'table' or not settings.stopwords.get_key) then
settings.stopwords = lua_maps.map_add_from_ucl(settings.stopwords, 'set', 'link affiliation stopwords')
if settings.stopwords then
if type(settings.stopwords) ~= 'table' or not settings.stopwords.get_key then
settings.stopwords = lua_maps.map_add_from_ucl(settings.stopwords, 'set', 'link affiliation stopwords')
end
else
settings.stopwords = lua_maps.map_add_from_ucl(DEFAULT_STOPWORDS, 'set', 'link affiliation stopwords (default)')
end
if settings.whitelist and (type(settings.whitelist) ~= 'table' or not settings.whitelist.get_key) then
settings.whitelist = lua_maps.map_add_from_ucl(settings.whitelist, 'set', 'link affiliation whitelist')
if settings.whitelist then
if type(settings.whitelist) ~= 'table' or not settings.whitelist.get_key then
settings.whitelist = lua_maps.map_add_from_ucl(settings.whitelist, 'set', 'link affiliation whitelist')
end
else
settings.whitelist = lua_maps.map_add_from_ucl(DEFAULT_WHITELIST, 'set', 'link affiliation whitelist (default)')
end
if settings.blacklist and (type(settings.blacklist) ~= 'table' or not settings.blacklist.get_key) then
settings.blacklist = lua_maps.map_add_from_ucl(settings.blacklist, 'set', 'link affiliation blacklist')
if settings.blacklist then
if type(settings.blacklist) ~= 'table' or not settings.blacklist.get_key then
settings.blacklist = lua_maps.map_add_from_ucl(settings.blacklist, 'set', 'link affiliation blacklist')
end
else
settings.blacklist = lua_maps.map_add_from_ucl(DEFAULT_BLACKLIST, 'set', 'link affiliation blacklist (default)')
end
end
@ -115,13 +171,7 @@ M.process_html_links = function(task, part, ctx)
local fp_tokens = etld1_tokens(first_party)
for _, c in ipairs(cands) do
local etld1 = c.etld1 or c.host or ''
-- approximate etld1 from host when not provided (split last two labels)
do
local h = tostring(etld1)
local p1, p2 = string.match(h, "([^.]+)%.([^.]+)$")
if p1 and p2 then etld1 = p1 .. "." .. p2 end
end
local etld1 = c.etld1 or rspamd_util.get_tld(c.host or '') or (c.host or '')
local toks = etld1_tokens(etld1)
local sim = jaccard(fp_tokens, toks)
@ -153,12 +203,7 @@ M.process_html_links = function(task, part, ctx)
return a.part_order < b.part_order
end)
local cta = cands[1]
local etld1 = cta.etld1 or cta.host or ''
do
local h = tostring(etld1)
local p1, p2 = string.match(h, "([^.]+)%.([^.]+)$")
if p1 and p2 then etld1 = p1 .. "." .. p2 end
end
local etld1 = cta.etld1 or rspamd_util.get_tld(cta.host or '') or (cta.host or '')
local toks = etld1_tokens(etld1)
local sim = jaccard(fp_tokens, toks)
res.cta_affiliated = (sim >= settings.min_similarity)

14
lualib/lua_selectors/transforms.lua

@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--
]] --
local fun = require 'fun'
local lua_util = require "lua_util"
@ -202,7 +202,7 @@ the second argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`,
end,
['description'] = 'Extracts substring; the first argument is start, the second is the last (like in Lua)',
['args_schema'] = { (ts.number + ts.string / tonumber):is_optional(),
(ts.number + ts.string / tonumber):is_optional() }
(ts.number + ts.string / tonumber):is_optional() }
},
-- Prepends a string or a strings list
['prepend'] = {
@ -460,7 +460,7 @@ Empty string comes the first argument or 'true', non-empty string comes nil]],
['description'] = 'Applies mask to IP address.' ..
' The first argument is the mask for IPv4 addresses, the second is the mask for IPv6 addresses.',
['args_schema'] = { (ts.number + ts.string / tonumber),
(ts.number + ts.string / tonumber):is_optional() }
(ts.number + ts.string / tonumber):is_optional() }
},
-- Returns the string(s) with all non ascii chars replaced
['to_ascii'] = {
@ -472,9 +472,9 @@ Empty string comes the first argument or 'true', non-empty string comes nil]],
['process'] = function(inp, _, args)
if type(inp) == 'table' then
return fun.map(
function(s)
return string.gsub(tostring(s), '[\128-\255]', args[1] or '?')
end, inp), 'string_list'
function(s)
return string.gsub(tostring(s), '[\128-\255]', args[1] or '?')
end, inp), 'string_list'
else
return string.gsub(tostring(inp), '[\128-\255]', '?'), 'string'
end
@ -492,7 +492,7 @@ Empty string comes the first argument or 'true', non-empty string comes nil]],
['process'] = function(inp, _, _)
return rspamd_util.get_tld(inp), 'string'
end,
['description'] = 'Extracts tld from a hostname represented as a string',
['description'] = 'Returns effective second-level domain (eSLD) using the Public Suffix List',
['args_schema'] = {}
},
-- Converts list of strings to numbers and returns a packed string

4
src/lua/lua_url.c

@ -663,7 +663,9 @@ lua_url_set_redirected(lua_State *L)
/***
* @method url:get_tld()
* Get effective second level domain part (eSLD) of the url host
* Get effective second level domain (eSLD) of the URL host.
* This method uses the Public Suffix List (PSL) to determine boundaries
* and compute the eSLD, not the top-level domain.
* @return {string} effective second level domain part (eSLD) of the url host
*/
static int

4
src/lua/lua_util.c

@ -191,7 +191,9 @@ LUA_FUNCTION_DEF(util, humanize_number);
/***
* @function util.get_tld(host)
* Returns effective second level domain part (eSLD) for the specified host
* Returns effective second level domain (eSLD) for the specified host.
* This function uses the Public Suffix List (PSL) to determine boundaries
* and compute the eSLD, not the top-level domain.
*
* @param {string} host hostname
* @return {string} eSLD part of the hostname or the full hostname if eSLD was not found

Loading…
Cancel
Save