Browse Source

[Test] WHITELIST_SURBL

pull/3032/head
korgoth1 6 years ago
parent
commit
b38a829826
  1. 198
      lualib/lua_magic/init.lua
  2. 302
      lualib/lua_magic/patterns.lua
  3. 113
      lualib/lua_magic/types.lua
  4. 2
      src/lua/lua_trie.c
  5. 3
      test/functional/cases/340_surbl.robot
  6. 10
      test/functional/configs/plugins.conf
  7. 2
      test/functional/messages/whitelist.eml

198
lualib/lua_magic/init.lua

@ -24,23 +24,89 @@ local types = require "lua_magic/types"
local fun = require "fun"
local lua_util = require "lua_util"
local rspamd_text = require "rspamd_text"
local rspamd_trie = require "rspamd_trie"
local N = "lua_magic"
local exports = {}
-- trie object
-- trie objects
local compiled_patterns
local compiled_short_patterns
local compiled_tail_patterns
-- {<str>, <match_object>, <pattern_object>} indexed by pattern number
local processed_patterns = {}
local short_patterns = {}
local tail_patterns = {}
local short_match_limit = 128
local max_short_offset = -1
local min_tail_offset = math.huge
local function process_patterns(log_obj)
-- Add pattern to either short patterns or to normal patterns
-- Files a prepared pattern into one of three lists, depending on how its
-- match is positioned:
--   * tail patterns  - numeric `position` together with a `tail` field;
--   * short patterns - numeric `position` below `short_match_limit`;
--   * long patterns  - everything else (non-numeric or large positions).
-- Also keeps `min_tail_offset` and `max_short_offset` up to date.
-- @param str pattern string stored as the first element of the entry
-- @param match match description table (reads `position` and `tail`)
-- @param pattern owning pattern table (its `ext` is used for logging)
local function add_processed(str, match, pattern)
  local entry = { str, match, pattern }
  local has_numeric_pos = type(match.position) == 'number'

  if has_numeric_pos and match.tail then
    -- Tail pattern
    table.insert(tail_patterns, entry)
    if match.tail < min_tail_offset then
      min_tail_offset = match.tail
    end
    lua_util.debugm(N, log_obj, 'add tail pattern %s for ext %s',
        str, pattern.ext)
  elseif has_numeric_pos and match.position < short_match_limit then
    -- Short pattern: fixed position near the start
    table.insert(short_patterns, entry)
    lua_util.debugm(N, log_obj, 'add short pattern %s for ext %s',
        str, pattern.ext)
    if match.position > max_short_offset then
      max_short_offset = match.position
    end
  else
    -- Long pattern: no numeric position, or a position at/after the limit
    table.insert(processed_patterns, entry)
    lua_util.debugm(N, log_obj, 'add long pattern %s for ext %s',
        str, pattern.ext)
  end
end
local function process_patterns()
if not compiled_patterns then
for _,pattern in ipairs(patterns) do
for ext,pattern in pairs(patterns) do
assert(types[ext], 'not found type: ' .. ext)
pattern.ext = ext
for _,match in ipairs(pattern.matches) do
if match.string then
processed_patterns[#processed_patterns + 1] = {
match.string, match, pattern
}
if match.relative_position and not match.position then
match.position = match.relative_position + #match.string
end
add_processed(match.string, match, pattern)
elseif match.hex then
local hex_table = {}
for i=1,#match.hex,2 do
local subc = match.hex:sub(i, i + 1)
hex_table[#hex_table + 1] = string.format('\\x{%s}', subc)
end
if match.relative_position and not match.position then
match.position = match.relative_position + #match.hex / 2
end
add_processed(table.concat(hex_table), match, pattern)
end
end
end
@ -49,18 +115,26 @@ local function process_patterns()
fun.map(function(t) return t[1] end, processed_patterns)),
rspamd_trie.flags.re
)
compiled_short_patterns = rspamd_trie.create(fun.totable(
fun.map(function(t) return t[1] end, short_patterns)),
rspamd_trie.flags.re
)
compiled_tail_patterns = rspamd_trie.create(fun.totable(
fun.map(function(t) return t[1] end, tail_patterns)),
rspamd_trie.flags.re
)
lua_util.debugm(N, rspamd_config, 'compiled %s patterns',
#processed_patterns)
lua_util.debugm(N, log_obj,
'compiled %s (%s short; %s long; %s tail) patterns',
#processed_patterns + #short_patterns + #tail_patterns,
#short_patterns, #processed_patterns, #tail_patterns)
end
end
exports.detect = function(input, log_obj)
process_patterns()
local res = {}
local matches = compiled_patterns:match(input)
local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, res)
local matches = trie:match(input)
if not log_obj then log_obj = rspamd_config end
local last = tlen
local function add_result(match, pattern)
if not res[pattern.ext] then
@ -77,7 +151,7 @@ exports.detect = function(input, log_obj)
end
for npat,matched_positions in pairs(matches) do
local pat_data = processed_patterns[npat]
local pat_data = processed_tbl[npat]
local pattern = pat_data[3]
local match = pat_data[2]
@ -99,6 +173,10 @@ exports.detect = function(input, log_obj)
expected = expected[2]
end
-- Tail match
if expected < 0 then
expected = last + expected + 1
end
return cmp(pos, expected)
end
-- Single position
@ -106,23 +184,39 @@ exports.detect = function(input, log_obj)
local position = match.position
for _,pos in ipairs(matched_positions) do
if match_position(pos, position) then
lua_util.debugm(N, log_obj, 'found match %s at offset %s(from %s)',
pattern.ext, pos, offset)
if match_position(pos + offset, position) then
add_result(match, pattern)
break
end
end
end
-- Match all positions
if match.positions then
local all_right = true
for _,position in ipairs(match.positions) do
local matched = false
for _,pos in ipairs(matched_positions) do
if match_position(pos, position) then
add_result(match, pattern)
if not match_position(pos + offset, position) then
matched = true
break
end
end
if not matched then
all_right = false
break
end
end
if all_right then
add_result(match, pattern)
end
end
end
end
local function process_detected(res)
local extensions = lua_util.keys(res)
if #extensions > 0 then
@ -130,6 +224,72 @@ exports.detect = function(input, log_obj)
return res[ex1] > res[ex2]
end)
return extensions,res[extensions[1]]
end
return nil
end
--- Detects the type of `input` using the compiled magic patterns.
-- Tail and short (head) patterns are tried first; if they already give a
-- confidence above 30, the result is returned without running long patterns.
-- Otherwise long patterns are matched either against the whole input or,
-- for inputs longer than three chunks, against the first two chunks and the
-- last chunk only.
-- @param input a Lua string (converted with rspamd_text.fromstring) or a
--   userdata value (presumably an rspamd_text - confirm against callers)
-- @param log_obj optional logging object, defaults to rspamd_config
-- @return extension and the matching entry of `types`, or nil if nothing matched
-- Raises an error for any other input type (table input is not implemented).
exports.detect = function(input, log_obj)
  if not log_obj then log_obj = rspamd_config end
  process_patterns(log_obj)

  local res = {}

  if type(input) == 'string' then
    -- Convert to rspamd_text
    input = rspamd_text.fromstring(input)
  end

  if type(input) == 'userdata' then
    local inplen = #input

    -- Check tail matches
    if inplen > min_tail_offset then
      local tail = input:span(inplen - min_tail_offset, min_tail_offset)
      match_chunk(tail, inplen, inplen - min_tail_offset,
          compiled_tail_patterns, tail_patterns, log_obj, res)
    end

    -- Try short match
    local head = input:span(1, math.min(max_short_offset, inplen))
    match_chunk(head, inplen, 0,
        compiled_short_patterns, short_patterns, log_obj, res)

    -- Check if we have enough data or go to long patterns
    local extensions,confidence = process_detected(res)

    if extensions and #extensions > 0 and confidence > 30 then
      -- We are done on short patterns
      return extensions[1],types[extensions[1]]
    end

    -- No way, let's check data in chunks or just the whole input if it is small enough
    if inplen > exports.chunk_size * 3 then
      -- Chunked version as input is too long
      local chunk1, chunk2 =
          input:span(1, exports.chunk_size * 2),
          input:span(inplen - exports.chunk_size, exports.chunk_size)
      local offset1, offset2 = 0, inplen - exports.chunk_size

      match_chunk(chunk1, inplen,
          offset1, compiled_patterns, processed_patterns, log_obj, res)
      match_chunk(chunk2, inplen,
          offset2, compiled_patterns, processed_patterns, log_obj, res)
    else
      -- Input is short enough to be matched in full
      match_chunk(input, inplen, 0,
          compiled_patterns, processed_patterns, log_obj, res)
    end
  else
    -- Table input is NYI.
    -- `assert(0, ...)` never fires in Lua because 0 is truthy, so raise explicitly.
    error('table input for match')
  end

  local extensions = process_detected(res)

  if extensions and #extensions > 0 then
    return extensions[1],types[extensions[1]]
  end

  return nil
end
-- This parameter specifies the size (in bytes) of a single chunk of the input.
-- For long inputs Rspamd checks 2 chunks at the start and 1 chunk at the end.
exports.chunk_size = 32768
return exports

302
lualib/lua_magic/patterns.lua

@ -20,14 +20,12 @@ limitations under the License.
--]]
local patterns = {
{
-- MSDOS extension to match types table
ext = 'pdf',
pdf = {
-- These are alternatives
matches = {
{
string = [[%PDF-\d]],
position = 6, -- must be end of the match, as that's how hyperscan works
position = 6, -- must be end of the match, as that's how hyperscan works (or use relative_position)
weight = 60,
},
{
@ -41,7 +39,301 @@ local patterns = {
weight = 60,
},
},
}
},
ps = {
matches = {
{
string = [[%!PS-Adobe]],
relative_position = 0,
weight = 60,
},
},
},
-- RTF document
rtf = {
matches = {
{
string = [[{\\rtf\d]],
position = 6,
weight = 60,
}
}
},
chm = {
matches = {
{
string = [[ITSF]],
relative_position = 0,
weight = 60,
}
}
},
djvu = {
matches = {
{
string = [[AT&TFORM]],
relative_position = 0,
weight = 60,
},
{
string = [[DJVM]],
relative_position = 0x0c,
weight = 60,
}
}
},
-- MS Exe file
exe = {
matches = {
{
string = [[MZ]],
relative_position = 0,
weight = 10,
},
-- PE part
{
string = [[PE\x{00}\x{00}]],
position = {'>=', 0x3c + 4},
weight = 40,
}
}
},
elf = {
matches = {
{
hex = [[7f454c46]],
relative_position = 0,
weight = 60,
},
}
},
lnk = {
matches = {
{
hex = [[4C0000000114020000000000C000000000000046]],
relative_position = 0,
weight = 60,
},
}
},
class = {
-- Technically, this also matches MachO files, but I don't care about
-- Apple and their mental health problems here: just consider Java files,
-- Mach object files and all other cafe babes as bad and block them!
matches = {
{
hex = [[cafebabe]],
relative_position = 0,
weight = 60,
},
}
},
-- Archives
arj = {
matches = {
{
hex = '60EA',
relative_position = 0,
weight = 60,
},
}
},
ace = {
matches = {
{
string = [[\*\*ACE\*\*]],
position = 14,
weight = 60,
},
}
},
cab = {
matches = {
{
hex = [[4d53434600000000]], -- Can be anywhere for SFX :(
position = {'>=', 8},
weight = 60,
},
}
},
tar = {
matches = {
{
string = [[ustar]],
relative_position = 257,
weight = 60,
},
}
},
bz2 = {
matches = {
{
string = "BZ[h0]",
position = 3,
weight = 60,
},
}
},
lz4 = {
matches = {
{
hex = "184d2204",
relative_position = 0,
weight = 60,
},
{
hex = "184c2103",
relative_position = 0,
weight = 60,
},
{
hex = "184c2102",
relative_position = 0,
weight = 60,
},
}
},
zst = {
matches = {
{
string = [[\x{FD}\x{2F}\x{B5}[\x{22}-\x{40}].]],
position = 5, -- includes last .
weight = 60,
},
}
},
iso = {
matches = {
{
string = [[\x{01}CD001\x{01}]],
position = {'>=', 0x8000 + 7}, -- first 32k is unused
weight = 60,
},
}
},
-- Apple is a 'special' child: this needs to be matched at the data tail...
dmg = {
matches = {
{
string = [[koly]],
position = -512 + 4,
weight = 61,
tail = 512,
},
}
},
szdd = {
matches = {
{
hex = [[535a4444]],
relative_position = 0,
weight = 60,
},
}
},
xz = {
matches = {
{
hex = [[FD377A585A00]],
relative_position = 0,
weight = 60,
},
}
},
-- Images
psd = {
matches = {
{
string = [[8BPS]],
relative_position = 0,
weight = 60,
},
}
},
ico = {
matches = {
{
hex = [[00000100]],
relative_position = 0,
weight = 60,
},
}
},
pcx = {
matches = {
{
hex = [[0A050108]],
relative_position = 0,
weight = 60,
},
}
},
pic = {
matches = {
{
hex = [[FF80C9C71A00]],
relative_position = 0,
weight = 60,
},
}
},
swf = {
matches = {
{
hex = [[5a5753]], -- LZMA
relative_position = 0,
weight = 60,
},
{
hex = [[435753]], -- Zlib
relative_position = 0,
weight = 60,
},
{
hex = [[465753]], -- Uncompressed
relative_position = 0,
weight = 60,
},
}
},
tiff = {
matches = {
{
hex = [[49492a00]], -- LE encoded
relative_position = 0,
weight = 60,
},
{
hex = [[4d4d]], -- BE tiff
relative_position = 0,
weight = 60,
},
}
},
-- Other
pgp = {
matches = {
{
hex = [[A803504750]],
relative_position = 0,
weight = 60,
},
{
hex = [[2D424547494E20504750204D4553534147452D]],
relative_position = 0,
weight = 60,
},
}
},
uue = {
matches = {
{
hex = [[626567696e20]],
relative_position = 0,
weight = 60,
},
}
},
}
return patterns

113
lualib/lua_magic/types.lua

@ -22,18 +22,123 @@ limitations under the License.
-- This table is indexed by msdos extension for convenience
local types = {
-- exe
exe = {
ct = 'application/x-ms-application',
type = 'executable',
},
elf = {
ct = 'application/x-elf-executable',
type = 'executable',
},
lnk = {
ct = 'application/x-ms-application',
type = 'executable',
},
class = {
ct = 'application/x-java-applet',
type = 'executable',
},
-- text
rtf = {
ct = "application/rtf",
type = 'text',
},
pdf = {
ct = 'application/pdf',
type = 'binary',
},
exe = {
ct = 'application/x-ms-application',
type = 'executable',
ps = {
ct = 'application/postscript',
type = 'binary',
},
chm = {
ct = 'application/x-chm',
type = 'binary',
},
djvu = {
ct = 'application/x-djvu',
type = 'binary',
},
-- archives
arj = {
ct = 'application/x-arj',
type = 'archive',
},
cab = {
ct = 'application/x-cab',
type = 'archive',
},
ace = {
ct = 'application/x-ace',
type = 'archive',
},
tar = {
ct = 'application/x-tar',
type = 'archive',
},
bz2 = {
ct = 'application/x-bzip',
type = 'archive',
},
xz = {
ct = 'application/x-xz',
type = 'archive',
},
lz4 = {
ct = 'application/x-lz4',
type = 'archive',
},
zst = {
ct = 'application/x-zstandard',
type = 'archive',
},
dmg = {
ct = 'application/x-dmg',
type = 'archive',
},
iso = {
ct = 'application/x-iso',
type = 'archive',
},
szdd = { -- in fact, their MSDOS extension is like FOO.TX_ or FOO.TX$
ct = 'application/x-compressed',
type = 'archive',
},
-- images
psd = {
ct = 'image/psd',
type = 'image',
},
pcx = {
ct = 'image/pcx',
type = 'image',
},
pic = {
ct = 'image/pic',
type = 'image',
},
tiff = {
ct = 'image/tiff',
type = 'image',
}
},
ico = {
ct = 'image/ico',
type = 'image',
},
swf = {
ct = 'application/x-shockwave-flash',
type = 'image',
},
-- other
pgp = {
ct = 'application/encrypted',
type = 'encrypted'
},
uue = {
ct = 'application/x-uuencoded',
type = 'binary',
},
}
return types

2
src/lua/lua_trie.c

@ -290,7 +290,7 @@ lua_trie_match (lua_State *L)
}
}
else if (lua_type (L, 2) == LUA_TUSERDATA) {
t = lua_check_text (L, -1);
t = lua_check_text (L, 2);
if (t && lua_trie_search_str (L, trie, t->start, t->len, cb)) {
found = TRUE;

3
test/functional/cases/340_surbl.robot

@ -94,7 +94,8 @@ SURBL example.com encoded url in subject
WHITELIST
${result} = Scan Message With Rspamc ${TESTDIR}/messages/whitelist.eml
Should Contain ${result.stdout} RSPAMD_URIBL (
Should Not Contain ${result.stdout} RSPAMD_URIBL (
Should Not Contain ${result.stdout} DBL_SPAM (
*** Keywords ***
Surbl Setup

10
test/functional/configs/plugins.conf

@ -579,6 +579,16 @@ options = {
type = a;
replies = ["127.0.0.4", "127.0.0.11"];
},
{
name = "rspamd-test.com.test.uribl";
type = a;
replies = ["127.0.0.2"];
},
{
name = "rspamd-test.com.test2.uribl";
type = a;
replies = ["127.0.1.2"];
},
{
name = "9.8.8.8.test4.uribl";
type = a;

2
test/functional/messages/whitelist.eml

@ -1,5 +1,3 @@
Content-Type: text/plain
http://rspamd.com
http://test.rspamd.example.com
http://rspamd-test.com
Loading…
Cancel
Save