mirror of https://github.com/rspamd/rspamd.git
Rapid spam filtering system
https://rspamd.com/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.3 KiB
81 lines
2.3 KiB
-- Unit tests for rspamd_util.tokenize_text: plain tokenization and
-- tokenization with exception ranges passed as a second argument.
context("Text tokenization test", function()
  local util = require "rspamd_util"

  -- Verifies a tokenizer result against the expected token list.
  --   tokens   - value returned by util.tokenize_text
  --   expected - array of expected token strings (may be empty)
  --   text     - the tokenizer input; used only to build failure messages
  local function assert_tokens(tokens, expected, text)
    -- Check for nil first so that a missing result is reported as an
    -- assertion failure rather than as an error from `#tokens`.
    assert_not_nil(tokens, "must tokenize " .. text)

    if #expected == 0 then
      assert_equal(#tokens, 0, "must not have tokens " .. text)
      return
    end

    -- Iterating over `tokens` alone cannot detect missing trailing tokens,
    -- so the lengths are compared explicitly before the element-wise check.
    assert_equal(#tokens, #expected, "wrong number of tokens for " .. text)

    for idx, token in ipairs(tokens) do
      assert_equal(token, expected[idx])
    end
  end

  -- Each case is { input text, expected tokens }.
  local simple_cases = {
    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
      {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
        "Integer", "mattis", "nibh"
      }
    },
    {"Հետաքրքրվողների համար ոտորև ներկայացված",
      {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
    },
    {"", {}},
    {",,,,,", {}},
    {"word,,,,,word ", {"word", "word"}},
    {"word", {"word"}},
    {",,,,word,,,", {"word"}}
  }

  for case_idx, case in ipairs(simple_cases) do
    test("Tokenize simple " .. case_idx, function()
      local text, expected = case[1], case[2]
      assert_tokens(util.tokenize_text(text), expected, text)
    end)
  end

  -- Each case is { input text, exception ranges, expected tokens }.
  -- Judging by the literals below, a range is { zero-based byte offset,
  -- byte length } of the URL inside the input (the Armenian prefix is
  -- 11 bytes long); each range is expected to yield one "!!EX!!" token.
  -- NOTE(review): confirm the range semantics against the tokenizer docs.
  local exception_cases = {
    {"word https://example.com/path word",
      {{5, 24}},
      {"word", "!!EX!!", "word"}
    },
    {"համար https://example.com/path համար",
      {{11, 24}},
      {"համար", "!!EX!!", "համար"}
    },
    {"word https://example.com/path https://example.com/path word",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!", "word"}
    },
    {"word https://example.com/path https://example.com/path",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!"}
    },
    {"https://example.com/path https://example.com/path word",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!", "word"}
    },
    {"https://example.com/path https://example.com/path",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!"}
    },
    {",,,,https://example.com/path https://example.com/path ",
      {{4, 24}, {29, 24}},
      {"!!EX!!", "!!EX!!"}
    },
  }

  for case_idx, case in ipairs(exception_cases) do
    test("Tokenize with exceptions " .. case_idx, function()
      local text, exceptions, expected = case[1], case[2], case[3]
      assert_tokens(util.tokenize_text(text, exceptions), expected, text)
    end)
  end
end)
|