mirror of https://github.com/rspamd/rspamd.git
Rapid spam filtering system
https://rspamd.com/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
438 lines
12 KiB
438 lines
12 KiB
-- Licensed to the Apache Software Foundation (ASF) under one or more
|
|
-- contributor license agreements. See the NOTICE file distributed with
|
|
-- this work for additional information regarding copyright ownership.
|
|
-- The ASF licenses this file to you under the Apache License, Version 2.0
|
|
-- (the "License"); you may not use this file except in compliance with
|
|
-- the License. You may obtain a copy of the License at:
|
|
--
|
|
-- http://www.apache.org/licenses/LICENSE-2.0
|
|
--
|
|
-- Unless required by applicable law or agreed to in writing, software
|
|
-- distributed under the License is distributed on an "AS IS" BASIS,
|
|
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
-- See the License for the specific language governing permissions and
|
|
-- limitations under the License.
|
|
|
|
local reconf = config['regexp']
|
|
|
|
-- Messages that have only HTML part
|
|
reconf['MIME_HTML_ONLY'] = {
|
|
re = 'has_only_html_part()',
|
|
score = 0.2,
|
|
description = 'Messages that have only HTML part',
|
|
group = 'headers'
|
|
}
|
|
|
|
local function has_anchor_parent(tag)
|
|
local parent = tag
|
|
repeat
|
|
parent = parent:get_parent()
|
|
if parent then
|
|
if parent:get_type() == 'a' then
|
|
return true
|
|
end
|
|
end
|
|
until not parent
|
|
|
|
return false
|
|
end
|
|
|
|
local function check_html_image(task, min, max)
|
|
local tp = task:get_text_parts()
|
|
|
|
for _,p in ipairs(tp) do
|
|
if p:is_html() then
|
|
local hc = p:get_html()
|
|
local len = p:get_length()
|
|
|
|
|
|
if hc and len >= min and len < max then
|
|
local images = hc:get_images()
|
|
if images then
|
|
for _,i in ipairs(images) do
|
|
local tag = i['tag']
|
|
if tag then
|
|
if has_anchor_parent(tag) then
|
|
-- do not trigger on small and unknown size images
|
|
if i['height'] + i['width'] >= 210 or not i['embedded'] then
|
|
return true
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
rspamd_config.HTML_SHORT_LINK_IMG_1 = {
|
|
callback = function(task)
|
|
return check_html_image(task, 0, 1024)
|
|
end,
|
|
score = 2.0,
|
|
group = 'html',
|
|
description = 'Short html part (0..1K) with a link to an image'
|
|
}
|
|
|
|
rspamd_config.HTML_SHORT_LINK_IMG_2 = {
|
|
callback = function(task)
|
|
return check_html_image(task, 1024, 1536)
|
|
end,
|
|
score = 1.0,
|
|
group = 'html',
|
|
description = 'Short html part (1K..1.5K) with a link to an image'
|
|
}
|
|
|
|
rspamd_config.HTML_SHORT_LINK_IMG_3 = {
|
|
callback = function(task)
|
|
return check_html_image(task, 1536, 2048)
|
|
end,
|
|
score = 0.5,
|
|
group = 'html',
|
|
description = 'Short html part (1.5K..2K) with a link to an image'
|
|
}
|
|
|
|
rspamd_config.R_EMPTY_IMAGE = {
|
|
callback = function(task)
|
|
local tp = task:get_text_parts() -- get text parts in a message
|
|
|
|
for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
|
|
if p:is_html() then -- if the current part is html part
|
|
local hc = p:get_html() -- we get HTML context
|
|
local len = p:get_length() -- and part's length
|
|
if hc and len < 50 then -- if we have a part that has less than 50 bytes of text
|
|
local images = hc:get_images() -- then we check for HTML images
|
|
|
|
if images then -- if there are images
|
|
for _,i in ipairs(images) do -- then iterate over images in the part
|
|
if i['height'] + i['width'] >= 400 then -- if we have a large image
|
|
local tag = i['tag']
|
|
if tag then
|
|
if not has_anchor_parent(tag) then
|
|
return true
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end,
|
|
|
|
score = 2.0,
|
|
group = 'html',
|
|
description = 'Message contains empty parts and image'
|
|
}
|
|
|
|
rspamd_config.R_SUSPICIOUS_IMAGES = {
|
|
callback = function(task)
|
|
local tp = task:get_text_parts() -- get text parts in a message
|
|
|
|
for _, p in ipairs(tp) do
|
|
local h = p:get_html()
|
|
|
|
if h then
|
|
local l = p:get_words_count()
|
|
local img = h:get_images()
|
|
local pic_words = 0
|
|
|
|
if img then
|
|
for _, i in ipairs(img) do
|
|
local dim = i['width'] + i['height']
|
|
local tag = i['tag']
|
|
|
|
if tag then
|
|
if has_anchor_parent(tag) then
|
|
if dim > 100 and dim < 3000 then
|
|
-- We assume that a single picture 100x200 contains approx 3 words of text
|
|
pic_words = pic_words + dim / 100
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
if l + pic_words > 0 then
|
|
local rel = pic_words / (l + pic_words)
|
|
|
|
if rel > 0.5 then
|
|
return true, (rel - 0.5) * 2
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
return false
|
|
end,
|
|
|
|
score = 5.0,
|
|
group = 'html',
|
|
description = 'Message contains many suspicious messages'
|
|
}
|
|
|
|
local vis_check_id = rspamd_config:register_symbol{
|
|
name = 'HTML_VISIBLE_CHECKS',
|
|
type = 'callback',
|
|
group = 'html',
|
|
callback = function(task)
|
|
--local logger = require "rspamd_logger"
|
|
local tp = task:get_text_parts() -- get text parts in a message
|
|
local ret = false
|
|
local diff = 0.0
|
|
local transp_rate = 0
|
|
local invisible_blocks = 0
|
|
local zero_size_blocks = 0
|
|
local arg
|
|
|
|
local normal_len = 0
|
|
local transp_len = 0
|
|
|
|
for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
|
|
normal_len = normal_len + p:get_length()
|
|
if p:is_html() and p:get_html() then -- if the current part is html part
|
|
local hc = p:get_html() -- we get HTML context
|
|
|
|
hc:foreach_tag({'font', 'span', 'div', 'p', 'td'}, function(tag)
|
|
local bl = tag:get_extra()
|
|
if bl then
|
|
if not bl['visible'] then
|
|
invisible_blocks = invisible_blocks + 1
|
|
end
|
|
|
|
if bl['font_size'] and bl['font_size'] == 0 then
|
|
zero_size_blocks = zero_size_blocks + 1
|
|
end
|
|
|
|
if bl['bgcolor'] and bl['color'] and bl['visible'] then
|
|
|
|
local color = bl['color']
|
|
local bgcolor = bl['bgcolor']
|
|
-- Should use visual approach here some day
|
|
local diff_r = math.abs(color[1] - bgcolor[1])
|
|
local diff_g = math.abs(color[2] - bgcolor[2])
|
|
local diff_b = math.abs(color[3] - bgcolor[3])
|
|
local r_avg = (color[1] + bgcolor[1]) / 2.0
|
|
-- Square
|
|
diff_r = diff_r * diff_r
|
|
diff_g = diff_g * diff_g
|
|
diff_b = diff_b * diff_b
|
|
|
|
diff = math.sqrt(2*diff_r + 4*diff_g + 3 * diff_b +
|
|
(r_avg * (diff_r - diff_b) / 256.0))
|
|
diff = diff / 256.0
|
|
|
|
if diff < 0.1 then
|
|
ret = true
|
|
local content_len = #(tag:get_content() or {})
|
|
invisible_blocks = invisible_blocks + 1 -- This block is invisible
|
|
transp_len = transp_len + content_len * (0.1 - diff) * 10.0
|
|
normal_len = normal_len - content_len
|
|
local tr = transp_len / (normal_len + transp_len)
|
|
if tr > transp_rate then
|
|
transp_rate = tr
|
|
arg = string.format('%s color #%x%x%x bgcolor #%x%x%x',
|
|
tostring(tag:get_type()),
|
|
color[1], color[2], color[3],
|
|
bgcolor[1], bgcolor[2], bgcolor[3])
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
return false -- Continue search
|
|
end)
|
|
|
|
end
|
|
end
|
|
|
|
if ret then
|
|
transp_rate = transp_len / (normal_len + transp_len)
|
|
|
|
if transp_rate > 0.1 then
|
|
if transp_rate > 0.5 or transp_rate ~= transp_rate then
|
|
transp_rate = 0.5
|
|
end
|
|
|
|
task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
|
|
end
|
|
end
|
|
|
|
if invisible_blocks > 0 then
|
|
if invisible_blocks > 10 then
|
|
invisible_blocks = 10
|
|
end
|
|
local rates = { -- From 1 to 10
|
|
0.05,
|
|
0.1,
|
|
0.2,
|
|
0.3,
|
|
0.4,
|
|
0.5,
|
|
0.6,
|
|
0.7,
|
|
0.8,
|
|
1.0,
|
|
}
|
|
task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
|
|
tostring(invisible_blocks))
|
|
end
|
|
|
|
if zero_size_blocks > 0 then
|
|
if zero_size_blocks > 5 then
|
|
if zero_size_blocks > 10 then
|
|
-- Full score
|
|
task:insert_result('ZERO_FONT', 1.0,
|
|
tostring(zero_size_blocks))
|
|
else
|
|
zero_size_blocks = 5
|
|
end
|
|
end
|
|
|
|
if zero_size_blocks <= 5 then
|
|
local rates = { -- From 1 to 5
|
|
0.1,
|
|
0.2,
|
|
0.2,
|
|
0.3,
|
|
0.5,
|
|
}
|
|
task:insert_result('ZERO_FONT', rates[zero_size_blocks],
|
|
tostring(zero_size_blocks))
|
|
end
|
|
end
|
|
end,
|
|
}
|
|
|
|
rspamd_config:register_symbol{
|
|
type = 'virtual',
|
|
parent = vis_check_id,
|
|
name = 'R_WHITE_ON_WHITE',
|
|
description = 'Message contains low contrast text',
|
|
score = 4.0,
|
|
group = 'html',
|
|
one_shot = true,
|
|
}
|
|
|
|
rspamd_config:register_symbol{
|
|
type = 'virtual',
|
|
parent = vis_check_id,
|
|
name = 'ZERO_FONT',
|
|
description = 'Zero sized font used',
|
|
score = 1.0, -- Reached if more than 5 elements have zero size
|
|
one_shot = true,
|
|
group = 'html'
|
|
}
|
|
|
|
rspamd_config:register_symbol{
|
|
type = 'virtual',
|
|
parent = vis_check_id,
|
|
name = 'MANY_INVISIBLE_PARTS',
|
|
description = 'Many parts are visually hidden',
|
|
score = 1.0, -- Reached if more than 10 elements are hidden
|
|
one_shot = true,
|
|
group = 'html'
|
|
}
|
|
|
|
rspamd_config.EXT_CSS = {
|
|
callback = function(task)
|
|
local regexp_lib = require "rspamd_regexp"
|
|
local re = regexp_lib.create_cached('/^.*\\.css(?:[?#].*)?$/i')
|
|
local tp = task:get_text_parts() -- get text parts in a message
|
|
local ret = false
|
|
for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
|
|
if p:is_html() and p:get_html() then -- if the current part is html part
|
|
local hc = p:get_html() -- we get HTML context
|
|
hc:foreach_tag({'link'}, function(tag)
|
|
local bl = tag:get_extra()
|
|
if bl then
|
|
local s = tostring(bl)
|
|
if s and re:match(s) then
|
|
ret = true
|
|
end
|
|
end
|
|
|
|
return ret -- Continue search
|
|
end)
|
|
|
|
end
|
|
end
|
|
|
|
return ret
|
|
end,
|
|
|
|
score = 1.0,
|
|
group = 'html',
|
|
description = 'Message contains external CSS reference'
|
|
}
|
|
|
|
rspamd_config.HTTP_TO_HTTPS = {
|
|
callback = function(task)
|
|
local tp = task:get_text_parts()
|
|
if (not tp) then return false end
|
|
for _,p in ipairs(tp) do
|
|
if p:is_html() then
|
|
local hc = p:get_html()
|
|
if (not hc) then return false end
|
|
local found = false
|
|
hc:foreach_tag('a', function (tag, length)
|
|
-- Skip this loop if we already have a match
|
|
if (found) then return true end
|
|
local c = tag:get_content()
|
|
if (c) then
|
|
c = tostring(c):lower()
|
|
if (not c:match('^http')) then return false end
|
|
local u = tag:get_extra()
|
|
if (not u) then return false end
|
|
u = tostring(u):lower()
|
|
if (not u:match('^http')) then return false end
|
|
if ((c:match('^http:') and u:match('^https:')) or
|
|
(c:match('^https:') and u:match('^http:')))
|
|
then
|
|
found = true
|
|
return true
|
|
end
|
|
end
|
|
return false
|
|
end)
|
|
if (found) then return true end
|
|
return false
|
|
end
|
|
end
|
|
return false
|
|
end,
|
|
description = 'Anchor text contains different scheme to target URL',
|
|
score = 2.0,
|
|
group = 'html'
|
|
}
|
|
|
|
rspamd_config.HTTP_TO_IP = {
|
|
callback = function(task)
|
|
local tp = task:get_text_parts()
|
|
if (not tp) then return false end
|
|
for _,p in ipairs(tp) do
|
|
if p:is_html() then
|
|
local hc = p:get_html()
|
|
if (not hc) then return false end
|
|
local found = false
|
|
hc:foreach_tag('a', function (tag, length)
|
|
if (found) then return true end
|
|
local u = tag:get_extra()
|
|
if (u) then
|
|
u = tostring(u):lower()
|
|
if (u:match('^https?://%d+%.%d+%.%d+%.%d+')) then
|
|
found = true
|
|
end
|
|
end
|
|
return false
|
|
end)
|
|
if found then return true end
|
|
return false
|
|
end
|
|
end
|
|
end,
|
|
description = 'Anchor points to an IP address',
|
|
score = 1.0,
|
|
group = 'html'
|
|
}
|