Rapid spam filtering system https://rspamd.com/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

207 lines
6.3 KiB

-- Test utf routines
context("UTF8 check functions", function()
local ffi = require("ffi")
ffi.cdef[[
unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
unsigned int rspamd_str_lc (char *str, unsigned int size);
void rspamd_fast_utf8_library_init (unsigned flags);
void ottery_rand_bytes(void *buf, size_t n);
double rspamd_get_ticks(int allow);
size_t rspamd_fast_utf8_validate (const unsigned char *data, size_t len);
size_t rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len);
size_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
size_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *);
]]
local cases = {
{"АбЫрвАлг", "абырвалг"},
{"АAБBвc", "аaбbвc"},
--{"STRASSE", "straße"}, XXX: NYI
{"KEÇİ", "keçi"},
}
for i,c in ipairs(cases) do
test("UTF lowercase " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
local nlen = ffi.C.rspamd_str_lc_utf8(buf, #c[1])
local s = ffi.string(buf, nlen)
assert_equal(s, c[2])
end)
end
cases = {
{"AbCdEf", "abcdef"},
{"A", "a"},
{"AaAa", "aaaa"},
{"AaAaAaAa", "aaaaaaaa"}
}
for i,c in ipairs(cases) do
test("ASCII lowercase " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
ffi.C.rspamd_str_lc(buf, #c[1])
local s = ffi.string(buf)
assert_equal(s, c[2])
end)
end
cases = {
{'тест', 'тест'},
{'\200\213\202', '���'},
{'тест\200\213\202test', 'тест���test'},
{'\200\213\202test', '���test'},
{'\200\213\202test\200\213\202', '���test���'},
{'тест\200\213\202test\200\213\202', 'тест���test���'},
{'тест\200\213\202test\200\213\202тест', 'тест���test���тест'},
}
local NULL = ffi.new 'void*'
for i,c in ipairs(cases) do
test("Unicode make valid " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
local s = ffi.string(ffi.C.rspamd_str_make_utf_valid(buf, #c[1], NULL, NULL))
local function to_hex(s)
return (s:gsub('.', function (c)
return string.format('%02X', string.byte(c))
end))
end
print(to_hex(s))
print(to_hex(c[2]))
assert_equal(s, c[2])
end)
end
-- Enable sse and avx2
ffi.C.rspamd_fast_utf8_library_init(3)
local valid_cases = {
"a",
"\xc3\xb1",
"\xe2\x82\xa1",
"\xf0\x90\x8c\xbc",
"안녕하세요, 세상"
}
for i,c in ipairs(valid_cases) do
test("Unicode validate success: " .. tostring(i), function()
local buf = ffi.new("char[?]", #c + 1)
ffi.copy(buf, c)
local ret = ffi.C.rspamd_fast_utf8_validate(buf, #c)
assert_equal(ret, 0)
end)
end
local invalid_cases = {
"\xc3\x28",
"\xa0\xa1",
"\xe2\x28\xa1",
"\xe2\x82\x28",
"\xf0\x28\x8c\xbc",
"\xf0\x90\x28\xbc",
"\xf0\x28\x8c\x28",
"\xc0\x9f",
"\xf5\xff\xff\xff",
"\xed\xa0\x81",
"\xf8\x90\x80\x80\x80",
"123456789012345\xed",
"123456789012345\xf1",
"123456789012345\xc2",
"\xC2\x7F"
}
for i,c in ipairs(invalid_cases) do
test("Unicode validate fail: " .. tostring(i), function()
local buf = ffi.new("char[?]", #c + 1)
ffi.copy(buf, c)
local ret = ffi.C.rspamd_fast_utf8_validate(buf, #c)
assert_not_equal(ret, 0)
end)
end
if os.getenv("RSPAMD_LUA_EXPENSIVE_TESTS") then
local speed_iters = 10000
local function test_size(buflen, is_valid, impl)
local logger = require "rspamd_logger"
local test_str
if is_valid then
test_str = table.concat(valid_cases)
else
test_str = table.concat(valid_cases) .. table.concat(invalid_cases)
end
local buf = ffi.new("char[?]", buflen)
if #test_str < buflen then
local t = {}
local len = #test_str
while len < buflen do
t[#t + 1] = test_str
len = len + #test_str
end
test_str = table.concat(t)
end
ffi.copy(buf, test_str:sub(1, buflen))
local tm = 0
for _=1,speed_iters do
if impl == 'ref' then
local t1 = ffi.C.rspamd_get_ticks(1)
ffi.C.rspamd_fast_utf8_validate_ref(buf, buflen)
local t2 = ffi.C.rspamd_get_ticks(1)
tm = tm + (t2 - t1)
elseif impl == 'sse' then
local t1 = ffi.C.rspamd_get_ticks(1)
ffi.C.rspamd_fast_utf8_validate_sse41(buf, buflen)
local t2 = ffi.C.rspamd_get_ticks(1)
tm = tm + (t2 - t1)
else
local t1 = ffi.C.rspamd_get_ticks(1)
ffi.C.rspamd_fast_utf8_validate_avx2(buf, buflen)
local t2 = ffi.C.rspamd_get_ticks(1)
tm = tm + (t2 - t1)
end
end
logger.messagex("%s utf8 %s check (valid = %s): %s ticks per iter, %s ticks per byte",
impl, buflen, is_valid,
tm / speed_iters, tm / speed_iters / buflen)
return 0
end
for _,sz in ipairs({78, 512, 65535}) do
test(string.format("Utf8 test %s %d buffer, %s", 'ref', sz, 'valid'), function()
local res = test_size(sz, true, 'ref')
assert_equal(res, 0)
end)
test(string.format("Utf8 test %s %d buffer, %s", 'ref', sz, 'invalid'), function()
local res = test_size(sz, false, 'ref')
assert_equal(res, 0)
end)
if jit.arch == 'x64' then
test(string.format("Utf8 test %s %d buffer, %s", 'sse', sz, 'valid'), function()
local res = test_size(sz, true, 'sse')
assert_equal(res, 0)
end)
test(string.format("Utf8 test %s %d buffer, %s", 'sse', sz, 'invalid'), function()
local res = test_size(sz, false, 'sse')
assert_equal(res, 0)
end)
test(string.format("Utf8 test %s %d buffer, %s", 'avx2', sz, 'valid'), function()
local res = test_size(sz, true, 'avx2')
assert_equal(res, 0)
end)
test(string.format("Utf8 test %s %d buffer, %s", 'avx2', sz, 'invalid'), function()
local res = test_size(sz, false, 'avx2')
assert_equal(res, 0)
end)
end
end
end
end)