Browse Source

[Rework] Improve bayes debug logging, remove unused stuff

pull/2635/head
Vsevolod Stakhov 7 years ago
parent
commit
86bf209292
  1. 5
      lualib/lua_stat.lua
  2. 11
      src/libstat/classifiers/bayes.c
  3. 38
      src/libstat/classifiers/classifiers.h
  4. 15
      src/libstat/classifiers/lua_classifier.c
  5. 11
      src/libstat/stat_config.c
  6. 1
      src/libstat/stat_internal.h
  7. 140
      src/libstat/stat_process.c
  8. 6
      src/libutil/logger.h

5
lualib/lua_stat.lua

@ -14,6 +14,11 @@ See the License for the specific language governing permissions and
limitations under the License.
]]--
--[[[
-- @module lua_stat
-- This module contains helper functions for supporting statistics
--]]
local logger = require "rspamd_logger"
local sqlite3 = require "rspamd_sqlite3"
local util = require "rspamd_util"

11
src/libstat/classifiers/bayes.c

@ -38,7 +38,7 @@
G_STRFUNC, \
__VA_ARGS__)
INIT_LOG_MODULE(bayes)
INIT_LOG_MODULE_PUBLIC(bayes)
static inline GQuark
bayes_error_quark (void)
@ -254,13 +254,20 @@ bayes_classify_token (struct rspamd_classifier *ctx,
gboolean
bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier *cl)
bayes_init (struct rspamd_config *cfg,
struct event_base *ev_base,
struct rspamd_classifier *cl)
{
cl->cfg->flags |= RSPAMD_FLAG_CLASSIFIER_INTEGER;
return TRUE;
}
/*
 * Finalizer for the bayes classifier.
 * The body is empty: `cl` is not read or modified and nothing is released.
 */
void
bayes_fin (struct rspamd_classifier *cl)
{
}
gboolean
bayes_classify (struct rspamd_classifier * ctx,
GPtrArray *tokens,

38
src/libstat/classifiers/classifiers.h

@ -3,6 +3,7 @@
#include "config.h"
#include "mem_pool.h"
#include <event.h>
#define RSPAMD_DEFAULT_CLASSIFIER "bayes"
/* Consider this value as 0 */
@ -10,28 +11,32 @@
struct rspamd_classifier_config;
struct rspamd_task;
struct rspamd_config;
struct rspamd_classifier;
struct token_node_s;
struct rspamd_stat_classifier {
char *name;
gboolean (*init_func)(rspamd_mempool_t *pool,
struct rspamd_classifier *cl);
gboolean (*init_func)(struct rspamd_config *cfg,
struct event_base *ev_base,
struct rspamd_classifier *cl);
gboolean (*classify_func)(struct rspamd_classifier * ctx,
GPtrArray *tokens,
struct rspamd_task *task);
GPtrArray *tokens,
struct rspamd_task *task);
gboolean (*learn_spam_func)(struct rspamd_classifier * ctx,
GPtrArray *input,
struct rspamd_task *task,
gboolean is_spam,
gboolean unlearn,
GError **err);
GPtrArray *input,
struct rspamd_task *task,
gboolean is_spam,
gboolean unlearn,
GError **err);
void (*fin_func)(struct rspamd_classifier *cl);
};
/* Bayes algorithm */
gboolean bayes_init (rspamd_mempool_t *pool,
struct rspamd_classifier *);
gboolean bayes_init (struct rspamd_config *cfg,
struct event_base *ev_base,
struct rspamd_classifier *);
gboolean bayes_classify (struct rspamd_classifier *ctx,
GPtrArray *tokens,
struct rspamd_task *task);
@ -41,10 +46,12 @@ gboolean bayes_learn_spam (struct rspamd_classifier *ctx,
gboolean is_spam,
gboolean unlearn,
GError **err);
void bayes_fin (struct rspamd_classifier *);
/* Generic lua classifier */
gboolean lua_classifier_init (rspamd_mempool_t *pool,
struct rspamd_classifier *);
gboolean lua_classifier_init (struct rspamd_config *cfg,
struct event_base *ev_base,
struct rspamd_classifier *);
gboolean lua_classifier_classify (struct rspamd_classifier *ctx,
GPtrArray *tokens,
struct rspamd_task *task);
@ -55,6 +62,11 @@ gboolean lua_classifier_learn_spam (struct rspamd_classifier *ctx,
gboolean unlearn,
GError **err);
/* Log id passed by msg_debug_bayes below; only declared here, defined elsewhere. */
extern guint rspamd_bayes_log_id;
/*
 * Debug logging tagged with the "bayes" module name.
 * Forwards the format string and its arguments to
 * rspamd_conditional_debug_fast together with rspamd_bayes_log_id and
 * G_STRFUNC.
 * The expansion reads `task->from_addr` and `task->task_pool->tag.uid`, so
 * it can only be used where a variable named `task` is in scope.
 */
#define msg_debug_bayes(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
rspamd_bayes_log_id, "bayes", task->task_pool->tag.uid, \
G_STRFUNC, \
__VA_ARGS__)
#endif
/*

15
src/libstat/classifiers/lua_classifier.c

@ -47,8 +47,9 @@ static GHashTable *lua_classifiers = NULL;
INIT_LOG_MODULE(luacl)
gboolean
lua_classifier_init (rspamd_mempool_t *pool,
struct rspamd_classifier *cl)
lua_classifier_init (struct rspamd_config *cfg,
struct event_base *ev_base,
struct rspamd_classifier *cl)
{
struct rspamd_lua_classifier_ctx *ctx;
lua_State *L = cl->ctx->cfg->lua_state;
@ -62,7 +63,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name);
if (ctx != NULL) {
msg_err_pool ("duplicate lua classifier definition: %s",
msg_err_config ("duplicate lua classifier definition: %s",
cl->subrs->name);
return FALSE;
@ -70,7 +71,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
lua_getglobal (L, "rspamd_classifiers");
if (lua_type (L, -1) != LUA_TTABLE) {
msg_err_pool ("cannot register classifier %s: no rspamd_classifier global",
msg_err_config ("cannot register classifier %s: no rspamd_classifier global",
cl->subrs->name);
lua_pop (L, 1);
@ -81,7 +82,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
lua_gettable (L, -2);
if (lua_type (L, -1) != LUA_TTABLE) {
msg_err_pool ("cannot register classifier %s: bad lua type: %s",
msg_err_config ("cannot register classifier %s: bad lua type: %s",
cl->subrs->name, lua_typename (L, lua_type (L, -1)));
lua_pop (L, 2);
@ -92,7 +93,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
lua_gettable (L, -2);
if (lua_type (L, -1) != LUA_TFUNCTION) {
msg_err_pool ("cannot register classifier %s: bad lua type for classify: %s",
msg_err_config ("cannot register classifier %s: bad lua type for classify: %s",
cl->subrs->name, lua_typename (L, lua_type (L, -1)));
lua_pop (L, 3);
@ -105,7 +106,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
lua_gettable (L, -2);
if (lua_type (L, -1) != LUA_TFUNCTION) {
msg_err_pool ("cannot register classifier %s: bad lua type for learn: %s",
msg_err_config ("cannot register classifier %s: bad lua type for learn: %s",
cl->subrs->name, lua_typename (L, lua_type (L, -1)));
lua_pop (L, 3);

11
src/libstat/stat_config.c

@ -28,6 +28,7 @@ static struct rspamd_stat_classifier lua_classifier = {
.init_func = lua_classifier_init,
.classify_func = lua_classifier_classify,
.learn_spam_func = lua_classifier_learn_spam,
.fin_func = NULL,
};
static struct rspamd_stat_classifier stat_classifiers[] = {
@ -36,6 +37,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = {
.init_func = bayes_init,
.classify_func = bayes_classify,
.learn_spam_func = bayes_learn_spam,
.fin_func = bayes_fin,
}
};
@ -182,7 +184,7 @@ rspamd_stat_init (struct rspamd_config *cfg, struct event_base *ev_base)
continue;
}
if (!cl->subrs->init_func (cfg->cfg_pool, cl)) {
if (!cl->subrs->init_func (cfg, ev_base, cl)) {
g_free (cl);
msg_err_config ("cannot init classifier type %s", clf->name);
cur = g_list_next (cur);
@ -328,6 +330,11 @@ rspamd_stat_close (void)
}
g_array_free (cl->statfiles_ids, TRUE);
if (cl->subrs->fin_func) {
cl->subrs->fin_func (cl);
}
g_free (cl);
}
@ -475,11 +482,11 @@ rspamd_stat_ctx_register_async (rspamd_stat_async_handler handler,
g_assert (st_ctx != NULL);
elt = g_malloc0 (sizeof (*elt));
REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
elt->handler = handler;
elt->cleanup = cleanup;
elt->ud = d;
elt->timeout = timeout;
REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
/* Enabled by default */

1
src/libstat/stat_internal.h

@ -41,6 +41,7 @@ struct rspamd_classifier {
gulong ham_learns;
struct rspamd_classifier_config *cfg;
struct rspamd_stat_classifier *subrs;
gpointer specific;
};
struct rspamd_statfile {

140
src/libstat/stat_process.c

@ -63,7 +63,7 @@ rspamd_stat_tokenize_header (struct rspamd_task *task,
}
}
msg_debug_task ("added stat tokens for header '%s'", name);
msg_debug_bayes ("added stat tokens for header '%s'", name);
}
}
@ -114,7 +114,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
g_array_append_val (ar, elt);
}
msg_debug_task ("added stat tokens for image '%s'", img->html_image->src);
msg_debug_bayes ("added stat tokens for image '%s'", img->html_image->src);
}
}
else if (part->cd && part->cd->filename.len > 0) {
@ -133,7 +133,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
elt.len = part->ct->boundary.len;
if (elt.len) {
msg_debug_task ("added stat tokens for mime boundary '%*s'",
msg_debug_bayes ("added stat tokens for mime boundary '%*s'",
(gint)elt.len, elt.begin);
g_array_append_val (ar, elt);
}
@ -155,13 +155,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
if (tp->language != NULL && tp->language[0] != '\0') {
elt.begin = (gchar *)tp->language;
elt.len = strlen (elt.begin);
msg_debug_task ("added stat tokens for part language '%s'", elt.begin);
msg_debug_bayes ("added stat tokens for part language '%s'", elt.begin);
g_array_append_val (ar, elt);
}
if (tp->real_charset != NULL) {
elt.begin = (gchar *)tp->real_charset;
elt.len = strlen (elt.begin);
msg_debug_task ("added stat tokens for part charset '%s'", elt.begin);
msg_debug_bayes ("added stat tokens for part charset '%s'", elt.begin);
g_array_append_val (ar, elt);
}
}
@ -184,124 +184,6 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
g_array_append_val (ar, elt);
}
/* Use more precise headers order */
#if 0
cur = g_list_first (task->headers_order->head);
while (cur) {
hdr = cur->data;
if (hdr->name && hdr->type != RSPAMD_HEADER_RECEIVED) {
elt.begin = hdr->name;
elt.len = strlen (hdr->name);
g_array_append_val (ar, elt);
}
cur = g_list_next (cur);
}
#endif
/* Use metatokens plugin from Lua */
lua_getglobal (L, "rspamd_plugins");
if (lua_type (L, -1) == LUA_TTABLE) {
lua_pushstring (L, "stat_metatokens");
lua_gettable (L, -2);
if (lua_type (L, -1) == LUA_TTABLE) {
gint old_top;
old_top = lua_gettop (L);
lua_pushstring (L, "callback");
lua_gettable (L, -2);
if (lua_type (L, -1) == LUA_TFUNCTION) {
struct rspamd_task **ptask;
ptask = lua_newuserdata (L, sizeof (*ptask));
rspamd_lua_setclass (L, "rspamd{task}", -1);
*ptask = task;
if (lua_pcall (L, 1, LUA_MULTRET, 0) != 0) {
msg_err_task ("stat_metatokens failed: %s",
lua_tostring (L, -1));
lua_pop (L, 1);
} else {
if (lua_gettop (L) > old_top &&
lua_istable (L, old_top + 1)) {
lua_pushvalue (L, old_top + 1);
/* Iterate over table of tables */
for (lua_pushnil (L); lua_next (L, -2);
lua_pop (L, 1)) {
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META|
RSPAMD_STAT_TOKEN_FLAG_LUA_META;
if (lua_isnumber (L, -1)) {
gdouble num = lua_tonumber (L, -1);
guint8 *pnum = rspamd_mempool_alloc (
task->task_pool,
sizeof (num));
msg_debug_task ("got metatoken number: %.2f",
num);
memcpy (pnum, &num, sizeof (num));
elt.begin = (gchar *) pnum;
elt.len = sizeof (num);
g_array_append_val (ar, elt);
} else if (lua_isstring (L, -1)) {
const gchar *str;
gsize tlen;
str = lua_tolstring (L, -1, &tlen);
guint8 *pstr = rspamd_mempool_alloc (
task->task_pool,
tlen);
memcpy (pstr, str, tlen);
msg_debug_task ("got metatoken string: %*s",
(gint) tlen, str);
elt.begin = (gchar *) pstr;
elt.len = tlen;
g_array_append_val (ar, elt);
}
else if (lua_istable (L, -1)) {
/* Treat that as unigramms */
for (lua_pushnil (L); lua_next (L, -2);
lua_pop (L, 1)) {
if (lua_isstring (L, -1)) {
const gchar *str;
gsize tlen;
str = lua_tolstring (L, -1, &tlen);
guint8 *pstr = rspamd_mempool_alloc (
task->task_pool,
tlen);
memcpy (pstr, str, tlen);
msg_debug_task ("got unigramm "
"metatoken string: %*s",
(gint) tlen, str);
elt.begin = (gchar *) pstr;
elt.len = tlen;
elt.flags |= RSPAMD_STAT_TOKEN_FLAG_UNIGRAM;
g_array_append_val (ar, elt);
}
}
}
}
}
}
}
}
}
lua_settop (L, 0);
st_ctx->tokenizer->tokenize_func (st_ctx,
task,
ar,
TRUE,
"META:",
task->tokens);
rspamd_mempool_add_destructor (task->task_pool,
rspamd_array_free_hard, ar);
}
@ -354,7 +236,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
msg_debug_task ("message has two common parts (%.2f), so skip the last one",
msg_debug_bayes ("message has two common parts (%.2f), so skip the last one",
*pdiff);
break;
}
@ -425,7 +307,7 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
if (!rspamd_symcache_is_symbol_enabled (task, task->cfg->cache,
st->stcf->symbol)) {
g_ptr_array_index (task->stat_runtimes, i) = NULL;
msg_debug_task ("symbol %s is disabled, skip classification",
msg_debug_bayes ("symbol %s is disabled, skip classification",
st->stcf->symbol);
continue;
}
@ -574,7 +456,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
if (bk_run == NULL) {
skip = TRUE;
msg_debug_task ("disable classifier %s as statfile symbol %s is disabled",
msg_debug_bayes ("disable classifier %s as statfile symbol %s is disabled",
cl->cfg->name, st->stcf->symbol);
break;
}
@ -583,7 +465,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
if (!skip) {
if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) {
msg_debug_task (
msg_debug_bayes (
"<%s> contains less tokens than required for %s classifier: "
"%ud < %ud",
task->message_id,
@ -593,7 +475,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
continue;
}
else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) {
msg_debug_task (
msg_debug_bayes (
"<%s> contains more tokens than allowed for %s classifier: "
"%ud > %ud",
task->message_id,
@ -1090,7 +972,7 @@ rspamd_stat_has_classifier_symbols (struct rspamd_task *task,
if (rspamd_task_find_symbol_result (task, st->stcf->symbol)) {
if (is_spam == !!st->stcf->is_spam) {
msg_debug_task ("do not autolearn %s as symbol %s is already "
msg_debug_bayes ("do not autolearn %s as symbol %s is already "
"added", is_spam ? "spam" : "ham", st->stcf->symbol);
return TRUE;

6
src/libutil/logger.h

@ -111,6 +111,12 @@ guint rspamd_logger_add_debug_module (const gchar *mod);
rspamd_##mname##_log_id = rspamd_logger_add_debug_module(#mname); \
}
/*
 * Defines `rspamd_<mname>_log_id` with no storage-class specifier,
 * initialised to (guint)-1, and an RSPAMD_CONSTRUCTOR body named
 * `rspamd_<mname>_log_init` that overwrites it with the value returned by
 * rspamd_logger_add_debug_module ("<mname>").
 * NOTE(review): presumably the externally visible counterpart of
 * INIT_LOG_MODULE, so that other translation units can declare the id
 * `extern` - confirm against the INIT_LOG_MODULE definition.
 */
#define INIT_LOG_MODULE_PUBLIC(mname) \
guint rspamd_##mname##_log_id = (guint)-1; \
RSPAMD_CONSTRUCTOR(rspamd_##mname##_log_init) { \
rspamd_##mname##_log_id = rspamd_logger_add_debug_module(#mname); \
}
void rspamd_logger_configure_modules (GHashTable *mods_enabled);
/**

Loading…
Cancel
Save