Rapid spam filtering system https://rspamd.com/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

76 lines
2.3 KiB

--[[
Common helpers for building LLM input content from a task
]] --
local lua_util = require "lua_util"
local lua_mime = require "lua_mime"
local fun = require "fun"
-- Module table: public functions are attached to it and it is returned at the end of the file
local M = {}
-- Module name, passed as the first argument to lua_util.debugm for debug messages
local N = 'llm_common'
--- Collect short, human-readable metadata lines describing a task.
-- @param task object providing has_urls() and get_from('mime')
-- @return string "Url domains: ..." line (TLDs of up to 5 selected URLs, or a
--         fixed "no urls found" message when the task has no URLs)
-- @return string "From: <name> <addr>" line built from the first MIME From
--         entry; missing parts are rendered as empty strings
local function get_meta_llm_content(task)
  local url_line
  if not task:has_urls() then
    url_line = "Url domains: no urls found"
  else
    local selected = lua_util.extract_specific_urls({ task = task, limit = 5, esld_limit = 1 })
    -- A URL without a TLD contributes an empty entry rather than being skipped
    local tld_of = function(url)
      return url:get_tld() or ''
    end
    local domains = fun.totable(fun.map(tld_of, selected or {}))
    url_line = "Url domains: " .. table.concat(domains, ', ')
  end

  -- Only the first MIME From entry is used; fall back to empty tables so that
  -- indexing never fails when the header is absent
  local mime_from = task:get_from('mime') or {}
  local sender = mime_from[1] or {}
  local from_line = string.format('From: %s <%s>', sender.name or '', sender.addr or '')

  return url_line, from_line
end
--- Build a structured payload suitable for LLM embeddings and chat.
-- The displayed text part of the task is selected first; messages without
-- such a part, or with fewer than 5 words in it, are rejected before any
-- subject/URL/From metadata is gathered.
-- @param task rspamd task
-- @param opts optional table; `opts.max_tokens` (number or numeric string)
--        limits how many words of the part are kept. Fractional values are
--        rounded down; missing, non-numeric or non-positive values fall back
--        to 1024.
-- @return table { subject = <string>, from = <string>, url_domains = <string>,
--         text = <rspamd_text|string> }, or nil when the message is rejected
-- @return the selected text part (nil only when no displayed part was found)
function M.build_llm_input(task, opts)
  opts = opts or {}

  local sel_part = lua_mime.get_displayed_text_part(task)
  if not sel_part then
    lua_util.debugm(N, task, 'no displayed text part found')
    return nil, nil
  end

  local nwords = sel_part:get_words_count() or 0
  if nwords < 5 then
    lua_util.debugm(N, task, 'too few words in part: %s', nwords)
    return nil, sel_part
  end

  -- Normalise the limit before it is used as a table.concat index: Lua 5.3+
  -- rejects numbers without an integer representation there, and a limit
  -- below 1 would silently produce an empty text.
  local max_tokens = tonumber(opts.max_tokens)
  if max_tokens then
    max_tokens = math.floor(max_tokens)
  end
  -- `not (x >= 1)` is also true for NaN, unlike `x < 1`
  if not max_tokens or not (max_tokens >= 1) then
    max_tokens = 1024
  end

  local text
  if nwords > max_tokens then
    -- Words are requested in their 'norm' form; the resulting list may be
    -- shorter than the reported word count, so re-check before cutting
    local words = sel_part:get_words('norm') or {}
    if #words > max_tokens then
      text = table.concat(words, ' ', 1, max_tokens)
      lua_util.debugm(N, task, 'truncated text to %s tokens (had %s words)', max_tokens, nwords)
    else
      text = table.concat(words, ' ')
      lua_util.debugm(N, task, 'using %s normalised words without truncation (limit %s, part had %s words)',
          #words, max_tokens, nwords)
    end
  else
    -- Keep rspamd_text (userdata) intact; consumers (http/ucl) can use it directly
    text = sel_part:get_content_oneline() or ''
  end

  -- Metadata is collected last so that rejected messages do not pay for
  -- URL extraction and header lookups
  local url_content, from_content = get_meta_llm_content(task)

  return {
    subject = task:get_subject() or '',
    from = from_content,
    url_domains = url_content,
    text = text,
  }, sel_part
end
-- Backwards-compat alias: build_embedding_input refers to the same function as build_llm_input
M.build_embedding_input = M.build_llm_input
-- Export the local metadata helper as part of the module table
M.get_meta_llm_content = get_meta_llm_content
-- Module result: the table holding the exported functions
return M