rspamd/test/lua/unit/tokenizer.lua


								--- Unit tests for `rspamd_util.tokenize_text`.
								-- The first group tokenizes plain text; the second group passes an extra
								-- table of exceptions and expects each of them to surface as "!!EX!!".
								context("Text tokenization test", function()
								  local util = require "rspamd_util"

								  --- Compare a tokenizer result against the expected token list.
								  -- @param words value returned by `util.tokenize_text`
								  -- @param text original input, used only in failure messages
								  -- @param expected array of expected tokens (empty means "no tokens")
								  local function check_tokens(words, text, expected)
								    if #expected == 0 then
								      assert_equal(#words, 0, "must not have tokens " .. text)
								      return
								    end

								    assert_not_nil(words, "must tokenize " .. text)
								    -- Check the size up front: walking only `words` would let a result
								    -- with missing trailing tokens pass unnoticed.
								    assert_equal(#words, #expected, "wrong number of tokens for " .. text)

								    for pos, word in ipairs(words) do
								      assert_equal(word, expected[pos])
								    end
								  end

								  -- Each case: { input text, expected tokens }
								  local simple_cases = {
								    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
								     {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
								      "Integer", "mattis", "nibh"
								     }
								    },
								    {"Հետաքրքրվողների համար ոտորև ներկայացված",
								     {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
								    },
								    {"", {}},
								    {",,,,,", {}},
								    {"word,,,,,word    ", {"word", "word"}},
								    {"word", {"word"}},
								    {",,,,word,,,", {"word"}}
								  }

								  for idx, case in ipairs(simple_cases) do
								    test("Tokenize simple " .. idx, function()
								      local text, expected = case[1], case[2]
								      check_tokens(util.tokenize_text(text), text, expected)
								    end)
								  end

								  -- Each case: { input text, exceptions, expected tokens }.
								  -- NOTE(review): judging by the fixtures, an exception looks like a
								  -- {byte offset (0-based), byte length} pair - confirm against the C API.
								  local exception_cases = {
								    {"word https://example.com/path word",
								     {{5, 24}},
								     {"word", "!!EX!!", "word"}
								    },
								    {"համար https://example.com/path համար",
								     {{11, 24}},
								     {"համար", "!!EX!!", "համար"}
								    },
								    {"word https://example.com/path https://example.com/path word",
								     {{5, 24}, {30, 24}},
								     {"word", "!!EX!!", "!!EX!!", "word"}
								    },
								    {"word https://example.com/path https://example.com/path",
								     {{5, 24}, {30, 24}},
								     {"word", "!!EX!!", "!!EX!!"}
								    },
								    {"https://example.com/path https://example.com/path word",
								     {{0, 24}, {25, 24}},
								     {"!!EX!!", "!!EX!!", "word"}
								    },
								    {"https://example.com/path https://example.com/path",
								     {{0, 24}, {25, 24}},
								     {"!!EX!!", "!!EX!!"}
								    },
								    {",,,,https://example.com/path https://example.com/path    ",
								     {{4, 24}, {29, 24}},
								     {"!!EX!!", "!!EX!!"}
								    },
								  }

								  for idx, case in ipairs(exception_cases) do
								    test("Tokenize with exceptions " .. idx, function()
								      local text, exceptions, expected = case[1], case[2], case[3]
								      check_tokens(util.tokenize_text(text, exceptions), text, expected)
								    end)
								  end
								end)