Browse Source

[Feature] PDF: Add timeouts for expensive operations

pull/3379/head
Vsevolod Stakhov 5 years ago
parent
commit
3b3de857f1
  1. 5
      conf/scores.d/content_group.conf
  2. 54
      lualib/lua_content/pdf.lua
  3. 10
      rules/content.lua
  4. 3
      src/lua/lua_task.c

5
conf/scores.d/content_group.conf

@ -43,5 +43,10 @@ symbols = {
description = "There is a PDF file with too many objects";
one_shot = true;
}
"PDF_TIMEOUT" {
weight = 0;
description = "There is a PDF file that caused timeout in processing";
one_shot = true;
}
}

54
lualib/lua_content/pdf.lua

@ -119,6 +119,7 @@ local config = {
max_pdf_objects = 10000, -- Maximum number of objects to be considered
max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
pdf_process_timeout = 1.0, -- Timeout in seconds for processing
}
-- Used to process patterns found in PDF
@ -809,7 +810,19 @@ end
-- PDF 1.5 ObjStmt
local function extract_pdf_compound_objects(task, pdf)
for _,obj in ipairs(pdf.objects or {}) do
for i,obj in ipairs(pdf.objects or {}) do
if i > 0 and i % 100 == 0 then
local now = rspamd_util.get_ticks()
if now >= pdf.end_timestamp then
pdf.timeout_processing = now - pdf.start_timestamp
lua_util.debugm(N, task, 'pdf: timeout processing compound objects after spending %s seconds, ' ..
'%s elements processed',
pdf.timeout_processing, i)
break
end
end
if obj.stream and obj.dict and type(obj.dict) == 'table' then
local t = obj.dict.Type
if t and t == 'ObjStm' then
@ -965,17 +978,47 @@ local function postprocess_pdf_objects(task, input, pdf)
-- Now we have objects and we need to attach streams that are in bounds
attach_pdf_streams(task, input, pdf)
-- Parse grammar for outer objects
for _,obj in ipairs(pdf.objects) do
for i,obj in ipairs(pdf.objects) do
if i > 0 and i % 100 == 0 then
local now = rspamd_util.get_ticks()
if now >= pdf.end_timestamp then
pdf.timeout_processing = now - pdf.start_timestamp
lua_util.debugm(N, task, 'pdf: timeout processing grammars after spending %s seconds, ' ..
'%s elements processed',
pdf.timeout_processing, i)
break
end
end
if obj.ref then
parse_object_grammar(obj, task, pdf)
end
end
extract_pdf_compound_objects(task, pdf)
-- Out of time budget: skip compound object extraction and everything after it
if pdf.timeout_processing then
  return
end
extract_pdf_compound_objects(task, pdf)
-- Now we might probably have all objects being processed
for _,obj in ipairs(pdf.objects) do
for i,obj in ipairs(pdf.objects) do
-- Check the time budget on every 100th object regardless of whether it has a
-- dictionary, so that dict-less objects at those positions cannot postpone
-- timeout detection (this matches the checks in the earlier loops).
-- ipairs indices start at 1, so no separate `i > 0` guard is required.
if i % 100 == 0 then
  local now = rspamd_util.get_ticks()
  if now >= pdf.end_timestamp then
    -- Store the elapsed time; a non-nil value marks this PDF as timed out
    pdf.timeout_processing = now - pdf.start_timestamp
    lua_util.debugm(N, task, 'pdf: timeout processing dicts after spending %s seconds, ' ..
        '%s elements processed',
        pdf.timeout_processing, i)
    break
  end
end
if obj.dict then
  -- Types processing
  process_dict(task, pdf, obj, obj.dict)
end
end
@ -1112,9 +1155,12 @@ local function process_pdf(input, mpart, task)
local matches = pdf_trie:match(input)
if matches then
local start_ts = rspamd_util.get_ticks()
local pdf_output = {
tag = 'pdf',
extract_text = extract_text_data,
start_timestamp = start_ts,
end_timestamp = start_ts + config.pdf_process_timeout,
}
local grouped_processors = {}
for npat,matched_positions in pairs(matches) do

10
rules/content.lua

@ -46,6 +46,10 @@ local function process_pdf_specific(task, part, specific)
task:insert_result('PDF_MANY_OBJECTS', 1.0, string.format('%s:%d',
part:get_filename() or 'unknown', specific.many_objects))
end
-- Report a PDF whose processing timed out, with `<filename>:<elapsed time>` as the option
if specific.timeout_processing then
  local fname = part:get_filename() or 'unknown'
  local option = string.format('%s:%.3f', fname, specific.timeout_processing)
  task:insert_result('PDF_TIMEOUT', 1.0, option)
end
end
local tags_processors = {
@ -104,3 +108,9 @@ rspamd_config:register_symbol{
parent = id,
groups = {"content", "pdf"},
}
-- Virtual symbol attached to the parent content rule; reported for PDF timeouts
rspamd_config:register_symbol({
  name = 'PDF_TIMEOUT',
  type = 'virtual',
  parent = id,
  groups = { 'content', 'pdf' },
})

3
src/lua/lua_task.c

@ -759,8 +759,9 @@ LUA_FUNCTION_DEF (task, get_date);
*/
LUA_FUNCTION_DEF (task, get_message_id);
/***
* @method task:get_timeval()
* @method task:get_timeval([raw])
* Returns the timestamp at which processing of the task started.
* @param {boolean} raw if true then two float numbers are returned: task start timestamp and timeout event timestamp
* @return {table} table with fields as described in `struct timeval` in C (two numbers are returned instead if `raw` is true)
*/
LUA_FUNCTION_DEF (task, get_timeval);

Loading…
Cancel
Save