mirror of https://github.com/rspamd/rspamd.git
Rapid spam filtering system
https://rspamd.com/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
216 lines
8.4 KiB
216 lines
8.4 KiB
--[[
CTA and link affiliation analysis

Purpose:
- Given a capped list of candidate links extracted in C during HTML parsing,
  compute simple affiliation scores between those links and the sender’s
  first-party domain, and pick a likely CTA (call-to-action) link.

How it is called:
- C code (message processing after HTML parsing) loads this function via
  `rspamd_lua_require_function(L, "lua_cta", "process_html_links")` and calls
  `process_html_links(task, part, ctx)` once per HTML text part.

Inputs (ctx table):
- links_total: total number of links in the part (summary; may be omitted)
- domains_total: number of distinct link domains (summary)
- max_links_single_domain: maximum links seen for a single domain (summary)
- candidates: array (capped in C, default 24) of small objects with fields:
  - host: link host (string)
  - idn, numeric, has_port, has_query, display_mismatch: booleans
  - order, part_order: integers (ordering hints)
  - etld1: optional eTLD+1 (if not set, this module approximates from host)

Outputs (returned table):
- cta_affiliated: boolean – whether the selected CTA appears affiliated
- cta_weight: number – simple weight hint (e.g. 1.0 if display mismatch)
- affiliated_ratio: number – fraction of candidates considered affiliated
- trackerish_ratio: number – fraction of candidates that look trackerish

Configuration (rspamd.conf):
- Use the `link_affiliation { ... }` section.
- Options:
  - stopwords: map (set/regexp/glob) used to strip common tracking tokens from
    domains when computing token overlap
  - whitelist / blacklist: optional maps (set) to tweak affiliation
  - min_similarity: number (default 0.5) – Jaccard threshold for affiliation
  - max_candidates: number (default 24) – extra Lua-side cap (C caps as well)

This module keeps all heavy config logic in Lua using lua_maps and only relies
on C to provide a bounded set of safe, pre-filtered candidates.
]]
|
|
local M = {}
|
|
|
|
local lua_util = require "lua_util"
|
|
local lua_maps = require "lua_maps"
|
|
local rspamd_util = require "rspamd_util"
|
|
|
|
-- Reasonable defaults (can be overridden in rspamd.conf: link_affiliation { ... })

-- Domain tokens that carry no brand information. They are loaded as a 'set'
-- map and removed from a domain's token set before similarity is computed.
-- Every token appears exactly once (earlier revisions listed "dk", "cz", "sk"
-- and "pt" twice).
local DEFAULT_STOPWORDS = {
  -- Common TLD tokens and ccTLDs
  "com", "net", "org", "info", "biz", "co", "io", "me", "us", "uk", "ru", "de",
  "fr", "au", "ca", "cn", "jp", "kr", "in", "eu",
  "es", "it", "nl", "pl", "se", "no", "fi", "dk", "cz", "sk", "pt", "tr", "gr",
  "hu", "ro", "bg", "ua", "by", "lt", "lv", "ee",
  "br", "mx", "ch", "be", "at", "ar", "cl", "pe", "tw", "th", "ph", "vn", "id",
  "hk", "sg", "nz", "za",
  "il", "ie", "is", "lu", "si", "hr", "rs", "gl", "ly",
  -- Generic / infrastructural
  "www", "web", "site", "app", "apps", "cloud", "cdn", "edge", "fastly",
  "akamai", "akamaihd", "edgesuite", "cloudfront",
  -- Tracking/redirect/marketing tokens
  "mail", "email", "news", "newsletter", "click", "link", "links", "go",
  "redir", "redirect", "rdir", "safe", "safelinks",
  "trk", "track", "tracking", "ref", "mkt", "mktg", "campaign", "promo",
  "offer", "offers",
  -- ESPs and bulk mailers (tokens found in their eTLD+1)
  "mailchimp", "mandrill", "sendgrid", "sparkpost", "sparkpostmail",
  "amazonses", "ses", "postmark", "postmarkapp", "mailgun",
  "sendinblue", "constantcontact", "list", "manage", "rs6", "aweber",
  "hubspot", "campaignmonitor", "cmail", "klaviyo", "sailthru",
  "drip", "convertkit", "getresponse", "mautic", "braze", "acoustic",
  "responsys", "eloqua", "iterable", "sendy", "emarsys", "mailjet",
  "mailerlite", "mailerq", "mailrelay", "mailup", "omnisend",
  "clickdimensions", "dotdigital", "pepipost",
}
|
|
|
|
-- Trusted eTLD+1 domains. Ships empty on purpose: entries are expected to come
-- from the user's configuration.
local DEFAULT_WHITELIST = {}
|
|
|
|
-- eTLD+1 domains loaded into the default blacklist 'set' map.
local DEFAULT_BLACKLIST = {
  -- Popular shorteners / redirection eTLD+1
  "t.co",
  "bit.ly",
  "goo.gl",
  "tinyurl.com",
  "lnkd.in",
  "buff.ly",
  "ow.ly",
  "rebrand.ly",
  "bitly.com",
  "is.gd",
  "v.gd",
  "t.ly",
  "cutt.ly",
  "shorturl.at",
  "reurl.cc",
  "rb.gy",
  "s.id",
  "trib.al",
  -- Common ESP/tracker link domains (treat as non-affiliated by default)
  "list-manage.com",
  "mandrillapp.com",
  "sendgrid.net",
  "sparkpostmail.com",
  "amazonses.com",
  "postmarkapp.com",
  "mailgun.org",
  "sendinblue.com",
  "constantcontact.com",
  "campaignmonitor.com",
  "cmail1.com",
  "cmail2.com",
  "aweber.com",
  "hubspot.com",
  "exacttarget.com",
  "clickdimensions.com",
  "eloqua.com",
  "responsys.net",
  "emarsys.net",
  "mailjet.com",
  "klaviyo.com",
  "dripemail2.com",
  "getresponse.com",
  "benchmarkemail.com",
  "omnisend.com",
  "mailerlite.com",
  "dotdigital.com",
}
|
|
|
|
-- Effective module options; load_settings() replaces this table with the
-- defaults merged with the `link_affiliation` configuration section.
-- The map-valued options (stopwords, whitelist, blacklist) start out unset
-- and are filled in by load_settings().
local settings = {
  -- Jaccard threshold at or above which a link counts as affiliated
  min_similarity = 0.5,
  -- Lua-side cap on the number of candidates examined per part
  max_candidates = 24,
}
|
|
|
|
--- Merge the `link_affiliation` configuration section into `settings` and make
-- sure stopwords, whitelist and blacklist end up as map objects.
-- When no global `rspamd_config` exists, only the built-in defaults are used.
local function load_settings()
  local cfg = rawget(_G, 'rspamd_config')
  local opts = {}
  if cfg then
    opts = cfg:get_all_opt('link_affiliation') or {}
  end
  settings = lua_util.override_defaults(settings, opts)

  -- Turn one option into a map:
  --  * unset                      -> 'set' map built from the built-in list
  --  * table exposing `get_key`   -> already a map, returned untouched
  --  * anything else              -> treated as a map definition
  local function resolve_map(value, builtin, descr, builtin_descr)
    if not value then
      return lua_maps.map_add_from_ucl(builtin, 'set', builtin_descr)
    end
    if type(value) == 'table' and value.get_key then
      return value
    end
    return lua_maps.map_add_from_ucl(value, 'set', descr)
  end

  settings.stopwords = resolve_map(settings.stopwords, DEFAULT_STOPWORDS,
      'link affiliation stopwords',
      'link affiliation stopwords (default)')
  settings.whitelist = resolve_map(settings.whitelist, DEFAULT_WHITELIST,
      'link affiliation whitelist',
      'link affiliation whitelist (default)')
  settings.blacklist = resolve_map(settings.blacklist, DEFAULT_BLACKLIST,
      'link affiliation blacklist',
      'link affiliation blacklist (default)')
end

-- Settings are resolved once, when the module is first required.
load_settings()
|
|
|
|
--- Split a domain into a set of lowercase alphanumeric tokens, leaving out
-- every token that the stopword map knows about.
-- @param dom domain string; nil is treated as the empty string
-- @return table used as a set: token -> true
local function etld1_tokens(dom)
  local lowered = string.lower(dom or '')
  local result = {}

  for word in string.gmatch(lowered, "[a-z0-9]+") do
    -- Without a stopword map every token is kept
    local is_stopword = settings.stopwords and settings.stopwords:get_key(word)
    if not is_stopword then
      result[word] = true
    end
  end

  return result
end
|
|
|
|
--- Jaccard similarity of two token sets: |a ∩ b| / |a ∪ b|.
-- @param a table used as a set (key -> truthy value)
-- @param b table used as a set (key -> truthy value)
-- @return number in [0, 1]; 0 when both sets are empty
local function jaccard(a, b)
  local shared, total = 0, 0

  -- Every key of `a` belongs to the union; those also in `b` are shared
  for key in pairs(a) do
    total = total + 1
    if b[key] then
      shared = shared + 1
    end
  end

  -- Keys found only in `b` complete the union
  for key in pairs(b) do
    if not a[key] then
      total = total + 1
    end
  end

  return total == 0 and 0 or shared / total
end
|
|
|
|
-- eTLD+1 of the first MIME From address, or nil when there is none.
-- Reduced to eTLD+1 so it is tokenised on the same basis as candidate links.
local function first_party_etld1(task)
  local from = task:get_from('mime') or {}
  local dom = from[1] and from[1].domain
  if not dom then
    return nil
  end
  return rspamd_util.get_tld(dom) or dom
end

-- Jaccard similarity between the first-party tokens and one candidate link.
-- Returns the similarity and the candidate's (stopword-free) token set.
local function candidate_similarity(fp_tokens, c)
  local etld1 = c.etld1 or rspamd_util.get_tld(c.host or '') or (c.host or '')
  local toks = etld1_tokens(etld1)
  return jaccard(fp_tokens, toks), toks
end

-- True when candidate `a` is a better CTA guess than `b`: higher C-side
-- weight first, then display mismatch, then earliest order / part_order.
-- Missing fields are tolerated: weight defaults to 0, display_mismatch to
-- false, and a missing order / part_order ranks after any present one.
local function is_better_cta(a, b)
  local aw, bw = tonumber(a.weight) or 0, tonumber(b.weight) or 0
  if aw ~= bw then
    return aw > bw
  end

  local am = a.display_mismatch and true or false
  local bm = b.display_mismatch and true or false
  if am ~= bm then
    return am
  end

  local ao = tonumber(a.order) or math.huge
  local bo = tonumber(b.order) or math.huge
  if ao ~= bo then
    return ao < bo
  end

  local ap = tonumber(a.part_order) or math.huge
  local bp = tonumber(b.part_order) or math.huge
  return ap < bp
end

--- Score the candidate links of one HTML part against the sender's domain and
-- guess which link is the call-to-action.
-- @param task object providing `get_from('mime')`
-- @param part HTML text part (not used by this implementation)
-- @param ctx table with an optional `candidates` array; nil is treated as empty
-- @return table with `affiliated_ratio` and `trackerish_ratio`, plus
--   `cta_affiliated` and `cta_weight` when at least one candidate exists
M.process_html_links = function(task, part, ctx)
  ctx = ctx or {}

  -- Capped private list: the caller's array is never reordered or modified,
  -- and non-table entries are skipped so the ratios match what is examined.
  local source = ctx.candidates or {}
  local limit = math.min(#source, tonumber(settings.max_candidates) or #source)
  local cands = {}
  for i = 1, limit do
    local c = source[i]
    if type(c) == 'table' then
      cands[#cands + 1] = c
    end
  end

  local fp_tokens = etld1_tokens(first_party_etld1(task))
  local affiliated, trackerish = 0, 0

  -- NOTE(review): settings.whitelist / settings.blacklist are loaded by this
  -- module but are not consulted below; confirm whether they should be.
  for _, c in ipairs(cands) do
    local sim, toks = candidate_similarity(fp_tokens, c)
    if sim >= settings.min_similarity then
      affiliated = affiliated + 1
    end
    -- very naive trackerish: nothing is left once stopwords are removed
    if next(toks) == nil then
      trackerish = trackerish + 1
    end
  end

  local total = #cands
  local res = {
    affiliated_ratio = (total > 0) and (affiliated / total) or 0,
    trackerish_ratio = (total > 0) and (trackerish / total) or 0,
  }

  if total > 0 then
    -- Linear scan for the best candidate; on ties the earliest entry wins
    local cta = cands[1]
    for i = 2, total do
      if is_better_cta(cands[i], cta) then
        cta = cands[i]
      end
    end

    local sim = candidate_similarity(fp_tokens, cta)
    res.cta_affiliated = (sim >= settings.min_similarity)
    res.cta_weight = (cta.display_mismatch and 1.0 or 0.5)
  end

  return res
end
|
|
|
|
return M
|