-- Mirror of https://github.com/rspamd/rspamd.git
-- Rspamd: rapid spam filtering system, https://rspamd.com/
--[[
Common helpers for building LLM input content from a task
]] --

local lua_util = require "lua_util"
local lua_mime = require "lua_mime"
local fun = require "fun"

-- Module table; populated below and returned at the end of the file
local M = {}
-- Module name passed to lua_util.debugm for debug messages
local N = 'llm_common'

--- Build short textual summaries of a task's URLs and MIME From header.
-- @param task object providing has_urls() and get_from('mime'); it is also
--   handed to lua_util.extract_specific_urls
-- @return string "Url domains: ..." line listing the comma-separated get_tld()
--   values of the extracted URLs, or "Url domains: no urls found" when there
--   is nothing to list
-- @return string "From: name <addr>" line; a missing name or address is
--   rendered as an empty string
local function get_meta_llm_content(task)
  local url_content = "Url domains: no urls found"
  if task:has_urls() then
    local urls = lua_util.extract_specific_urls { task = task, limit = 5, esld_limit = 1 }
    -- Drop URLs without a TLD so the list never contains empty entries
    local domains = fun.totable(fun.filter(function(tld)
      return tld ~= ''
    end, fun.map(function(u)
      return u:get_tld() or ''
    end, urls or {})))
    -- Keep the "no urls found" default when extraction yields nothing usable
    if #domains > 0 then
      url_content = "Url domains: " .. table.concat(domains, ', ')
    end
  end

  -- First MIME From entry, or an empty table when the header is absent
  local from_or_empty = ((task:get_from('mime') or {})[1] or {})
  local from_name = from_or_empty.name or ''
  local from_addr = from_or_empty.addr or ''
  local from_content = string.format('From: %s <%s>', from_name, from_addr)

  return url_content, from_content
end

--- Build structured payload suitable for LLM embeddings and chat.
-- @param task task object; its subject, URLs, MIME From header and displayed
--   text part are read
-- @param opts optional table; opts.max_tokens caps the number of words kept
--   (default 1024)
-- @return table { subject = <string>, from = <string>, url_domains = <string>,
--   text = <rspamd_text|string> }, or nil when no displayed text part exists
--   or the part has fewer than 5 words
-- @return the selected text part, or nil when none was found
function M.build_llm_input(task, opts)
  opts = opts or {}
  local subject = task:get_subject() or ''
  local url_content, from_content = get_meta_llm_content(task)

  local sel_part = lua_mime.get_displayed_text_part(task)
  if not sel_part then
    lua_util.debugm(N, task, 'no displayed text part found')
    return nil, nil
  end

  local nwords = sel_part:get_words_count() or 0
  if nwords < 5 then
    lua_util.debugm(N, task, 'too few words in part: %s', nwords)
    return nil, sel_part
  end

  -- The limit is used as a table.concat bound, which must be an integer
  -- (a fractional value raises an error on Lua 5.3+), so round it down
  local max_tokens = math.floor(tonumber(opts.max_tokens) or 1024)
  local text
  if nwords > max_tokens then
    local words = sel_part:get_words('norm') or {}
    if #words > max_tokens then
      text = table.concat(words, ' ', 1, max_tokens)
      lua_util.debugm(N, task, 'truncated text to %s tokens (had %s words)', max_tokens, nwords)
    else
      -- The word list already fits within the limit, so nothing is cut
      text = table.concat(words, ' ')
    end
  else
    -- Keep rspamd_text (userdata) intact; consumers (http/ucl) can use it directly
    text = sel_part:get_content_oneline() or ''
  end

  return {
    subject = subject,
    from = from_content,
    url_domains = url_content,
    text = text,
  }, sel_part
end

-- Backwards-compat alias
M.build_embedding_input = M.build_llm_input

-- Also export the URL/From summary helper
M.get_meta_llm_content = get_meta_llm_content

return M