mirror of https://github.com/rspamd/rspamd.git
Rapid spam filtering system
https://rspamd.com/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
207 lines
6.3 KiB
207 lines
6.3 KiB
-- Test utf routines

-- Unit tests for the UTF-8 helpers exported by the C core: lowercasing
-- (UTF-8 aware and ASCII only), repairing invalid UTF-8 and validating UTF-8.
-- When the RSPAMD_LUA_EXPENSIVE_TESTS environment variable is set, a small
-- benchmark of the individual validator implementations is registered too.
context("UTF8 check functions", function()
  local ffi = require("ffi")

  ffi.cdef[[
    unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
    unsigned int rspamd_str_lc (char *str, unsigned int size);
    void rspamd_fast_utf8_library_init (unsigned flags);
    void ottery_rand_bytes(void *buf, size_t n);
    double rspamd_get_ticks(int allow);
    size_t rspamd_fast_utf8_validate (const unsigned char *data, size_t len);
    size_t rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len);
    size_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
    size_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
    char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *);
  ]]

  -- Cache the namespace (not the individual functions), so that symbols are
  -- still resolved lazily at the call sites.
  local C = ffi.C

  -- Copies a Lua string into a freshly allocated C buffer.
  -- The buffer has room for the trailing zero byte, which the two-argument
  -- form of ffi.copy() writes after the string data.
  local function to_cbuf(str)
    local buf = ffi.new("char[?]", #str + 1)
    ffi.copy(buf, str)
    return buf
  end

  -- Renders every byte of `str` as two upper-case hex digits.
  local function to_hex(str)
    return (str:gsub('.', function(ch)
      return string.format('%02X', string.byte(ch))
    end))
  end

  -- {input, expected lowercase form}
  local utf_lc_cases = {
    {"АбЫрвАлг", "абырвалг"},
    {"АAБBвc", "аaбbвc"},
    --{"STRASSE", "straße"}, XXX: NYI
    {"KEÇİ", "keçi"},
  }

  for i, c in ipairs(utf_lc_cases) do
    test("UTF lowercase " .. tostring(i), function()
      local buf = to_cbuf(c[1])
      -- The string is converted in place; the return value is used as the
      -- length of the converted data.
      local nlen = C.rspamd_str_lc_utf8(buf, #c[1])
      local s = ffi.string(buf, nlen)
      assert_equal(s, c[2])
    end)
  end

  -- {input, expected lowercase form}
  local ascii_lc_cases = {
    {"AbCdEf", "abcdef"},
    {"A", "a"},
    {"AaAa", "aaaa"},
    {"AaAaAaAa", "aaaaaaaa"}
  }

  for i, c in ipairs(ascii_lc_cases) do
    test("ASCII lowercase " .. tostring(i), function()
      local buf = to_cbuf(c[1])
      C.rspamd_str_lc(buf, #c[1])
      -- The return value is ignored here: the result is read back up to the
      -- zero terminator of the buffer.
      local s = ffi.string(buf)
      assert_equal(s, c[2])
    end)
  end

  -- {input, expected output}: every invalid byte is expected to come back as
  -- one U+FFFD replacement character, valid parts are expected unchanged.
  local make_valid_cases = {
    {'тест', 'тест'},
    {'\200\213\202', '���'},
    {'тест\200\213\202test', 'тест���test'},
    {'\200\213\202test', '���test'},
    {'\200\213\202test\200\213\202', '���test���'},
    {'тест\200\213\202test\200\213\202', 'тест���test���'},
    {'тест\200\213\202test\200\213\202тест', 'тест���test���тест'},
  }

  local NULL = ffi.new 'void*'
  for i, c in ipairs(make_valid_cases) do
    test("Unicode make valid " .. tostring(i), function()
      local buf = to_cbuf(c[1])

      -- NOTE(review): the returned pointer is never released by this test; if
      -- the C side hands ownership of a fresh allocation to the caller, this
      -- leaks one buffer per case - confirm against the C implementation.
      local s = ffi.string(C.rspamd_str_make_utf_valid(buf, #c[1], NULL, NULL))

      -- Dump both strings as hex only on mismatch: the raw bytes are not
      -- readable in a terminal, and passing tests should stay silent.
      if s ~= c[2] then
        print(to_hex(s))
        print(to_hex(c[2]))
      end
      assert_equal(s, c[2])
    end)
  end

  -- Enable sse and avx2
  C.rspamd_fast_utf8_library_init(3)

  local valid_cases = {
    "a",
    "\xc3\xb1",
    "\xe2\x82\xa1",
    "\xf0\x90\x8c\xbc",
    "안녕하세요, 세상"
  }

  for i, c in ipairs(valid_cases) do
    test("Unicode validate success: " .. tostring(i), function()
      local buf = to_cbuf(c)
      -- Valid input is expected to yield 0.
      local ret = C.rspamd_fast_utf8_validate(buf, #c)
      assert_equal(ret, 0)
    end)
  end

  local invalid_cases = {
    "\xc3\x28",
    "\xa0\xa1",
    "\xe2\x28\xa1",
    "\xe2\x82\x28",
    "\xf0\x28\x8c\xbc",
    "\xf0\x90\x28\xbc",
    "\xf0\x28\x8c\x28",
    "\xc0\x9f",
    "\xf5\xff\xff\xff",
    "\xed\xa0\x81",
    "\xf8\x90\x80\x80\x80",
    "123456789012345\xed",
    "123456789012345\xf1",
    "123456789012345\xc2",
    "\xC2\x7F"
  }

  for i, c in ipairs(invalid_cases) do
    test("Unicode validate fail: " .. tostring(i), function()
      local buf = to_cbuf(c)
      -- Invalid input is expected to yield a non-zero value.
      local ret = C.rspamd_fast_utf8_validate(buf, #c)
      assert_not_equal(ret, 0)
    end)
  end

  if os.getenv("RSPAMD_LUA_EXPENSIVE_TESTS") then
    local speed_iters = 10000

    -- Implementation name -> validator call. The C symbols are looked up
    -- inside the closures, so an implementation is only resolved when it is
    -- actually benchmarked.
    local validators = {
      ref = function(data, len)
        return C.rspamd_fast_utf8_validate_ref(data, len)
      end,
      sse = function(data, len)
        return C.rspamd_fast_utf8_validate_sse41(data, len)
      end,
      avx2 = function(data, len)
        return C.rspamd_fast_utf8_validate_avx2(data, len)
      end,
    }

    -- Benchmarks one validator implementation on a buffer of `buflen` bytes
    -- and logs the average number of ticks per iteration and per byte.
    --   buflen:   size of the benchmark buffer in bytes
    --   is_valid: true  - fill the buffer from `valid_cases` only
    --             false - append `invalid_cases` to the fill pattern
    --   impl:     'ref' or 'sse'; any other value selects the AVX2 validator
    -- Always returns 0: only the timing is of interest, the validator result
    -- is discarded.
    local function test_size(buflen, is_valid, impl)
      local logger = require "rspamd_logger"

      local test_str = table.concat(valid_cases)
      if not is_valid then
        test_str = test_str .. table.concat(invalid_cases)
      end

      local buf = ffi.new("char[?]", buflen)
      if #test_str < buflen then
        -- Use whole repetitions of the pattern only, so that no multi-byte
        -- sequence is cut in the middle; the rest of the buffer keeps the
        -- zero bytes it was allocated with.
        test_str = test_str:rep(math.ceil(buflen / #test_str) - 1)
      end

      -- Copy with an explicit length: ffi.copy(dst, str) also writes a zero
      -- terminator, which lands one byte past the end of `buf` whenever the
      -- pattern is at least `buflen` bytes long (e.g. the invalid pattern
      -- with the 78 byte buffer).
      local payload = test_str:sub(1, buflen)
      ffi.copy(buf, payload, #payload)

      local validate = validators[impl] or validators.avx2
      local tm = 0

      for _ = 1, speed_iters do
        local t1 = C.rspamd_get_ticks(1)
        validate(buf, buflen)
        local t2 = C.rspamd_get_ticks(1)
        tm = tm + (t2 - t1)
      end

      logger.messagex("%s utf8 %s check (valid = %s): %s ticks per iter, %s ticks per byte",
          impl, buflen, is_valid,
          tm / speed_iters, tm / speed_iters / buflen)

      return 0
    end

    -- The reference implementation is benchmarked everywhere, the SIMD ones
    -- on x64 only.
    -- NOTE(review): x64 alone does not tell whether the CPU supports AVX2 -
    -- confirm that calling the avx2 validator directly is safe on the hosts
    -- that run the expensive tests.
    local impls = {'ref'}
    if jit.arch == 'x64' then
      impls[#impls + 1] = 'sse'
      impls[#impls + 1] = 'avx2'
    end

    for _, sz in ipairs({78, 512, 65535}) do
      for _, impl in ipairs(impls) do
        for _, is_valid in ipairs({true, false}) do
          local label = is_valid and 'valid' or 'invalid'
          test(string.format("Utf8 test %s %d buffer, %s", impl, sz, label), function()
            local res = test_size(sz, is_valid, impl)
            assert_equal(res, 0)
          end)
        end
      end
    end
  end

end)
|