|
|
|
@ -30,438 +30,428 @@ my $man; |
|
|
|
my $help;

# Parse the command line into the file-level option variables.
#
# The original option list declared the short alias "t" for both --timeout
# and --train-fraction. Getopt::Long warns about such a duplicate and lets
# the later specification win, so "-t" has effectively always meant
# --train-fraction. The alias is therefore dropped from --timeout, which
# keeps the observable behaviour and removes the warning.
GetOptions(
    "spam|s=s"           => \$spam_dir,
    "ham|h=s"            => \$ham_dir,
    "spam-symbol=s"      => \$spam_symbol,
    "ham-symbol=s"       => \$ham_symbol,
    "classifier|c=s"     => \$classifier,
    "timeout=f"          => \$timeout,
    "parallel|p=i"       => \$parallel,
    "train-fraction|t=f" => \$train_fraction,
    "bogofilter|b"       => \$use_bogofilter,
    "dspam|d"            => \$use_dspam,
    "check-only"         => \$check_only,
    "help|?"             => \$help,
    "man"                => \$man
) or pod2usage(2);

# --help prints the short usage text, --man the full manual page.
pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;
|
|
|
|
|
|
|
# Append the path of every regular file found directly inside a directory
# to a caller-supplied array.
#
#   $dir    - directory to scan (not recursive)
#   $target - array reference that receives "$dir/$file" entries
#
# Dies if the directory cannot be opened.
sub read_dir_files {
    my ( $dir, $target ) = @_;

    opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";

    # Test definedness explicitly so that an entry literally named "0" is
    # not mistaken for the end of the directory.
    while ( defined( my $file = readdir $dh ) ) {
        my $path = "$dir/$file";
        push @{$target}, $path if -f $path;
    }

    # The handle was previously left open until it went out of scope.
    closedir($dh);

    return;
}
|
|
|
|
|
|
|
# Shuffle an array in place using the Fisher-Yates algorithm.
#
#   $ar - reference to the array to shuffle
#
# The previous implementation picked the swap partner with
# rand($i - 1) and only for $i > 1, so a two-element array was never
# reordered and an element could never stay in place or swap with its
# immediate predecessor; the resulting permutation was biased.
sub shuffle_array {
    my ($ar) = @_;

    # Walk from the last index down; each position is swapped with a
    # uniformly chosen position in 0 .. $i (possibly itself).
    for ( my $i = $#{$ar} ; $i > 0 ; $i-- ) {
        my $sel = int( rand( $i + 1 ) );
        next if $sel == $i;
        @{$ar}[ $i, $sel ] = @{$ar}[ $sel, $i ];
    }

    return;
}
|
|
|
|
|
|
|
# Train rspamd on a batch of messages through the rspamc client.
#
#   $files - array reference of message file paths
#   $spam  - true to learn the batch as spam, false to learn it as ham
#
# Returns the number of JSON replies that reported success.
# Dies if rspamc cannot be spawned.
sub learn_rspamc {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    my $cmd         = $spam ? "learn_spam" : "learn_ham";
    my $args_quoted = shell_quote @{$files};

    # NOTE(review): only the file names are shell-quoted; $classifier and
    # $timeout are interpolated as given on the command line.
    open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" )
      or die "cannot spawn $rspamc: $!";

    # rspamc is run with --compact -j; each output line is decoded as one
    # JSON document. Lines that do not decode to a hash are ignored.
    while ( my $line = <$p> ) {
        my $res = eval { decode_json($line) };
        next unless ref($res) eq 'HASH';
        $processed++ if $res->{'success'};
    }

    # Closing a pipe handle also waits for the child process.
    close($p);

    return $processed;
}
|
|
|
|
|
|
|
# Train bogofilter on a batch of messages, one invocation per file.
#
#   $files - array reference of message file paths
#   $spam  - true to register the files as spam (-s), false as ham (-n)
#
# Returns the number of invocations that exited with status 0.
sub learn_bogofilter {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    # The class flag is the same for the whole batch.
    my $fl = $spam ? "-s" : "-n";

    foreach my $f ( @{$files} ) {
        my $args_quoted = shell_quote $f;

        # Backticks are used so that bogofilter's stdout is captured and
        # discarded; only the exit status matters.
        `$bogofilter -I $args_quoted $fl`;
        $processed++ if $? == 0;
    }

    return $processed;
}
|
|
|
|
|
|
|
# Train dspam on a batch of messages by piping each file to its stdin.
#
#   $files - array reference of message file paths
#   $spam  - true to train as spam, false to train as innocent
#
# Returns the number of files for which dspam exited successfully.
# (The previous version never incremented its counter and always
# returned 0.) Dies if dspam cannot be spawned; a message file that
# cannot be opened is reported with a warning and skipped.
sub learn_dspam {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    my $fl = $spam ? "--class=spam" : "--class=innocent";

    foreach my $f ( @{$files} ) {

        # Open the message first (three-argument form, checked) so that
        # dspam is not started for a file we cannot read.
        my $inp;
        if ( !open( $inp, '<', $f ) ) {
            warn "cannot open $f: $!\n";
            next;
        }

        open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
          or die "cannot run $dspam: $!";

        while ( my $line = <$inp> ) {
            print {$p} $line;
        }
        close($inp);

        # close() on a pipe waits for the child and is false when the
        # child exited with a non-zero status.
        $processed++ if close($p);
    }

    return $processed;
}
|
|
|
|
|
|
|
# Split the first $count elements of @$samples into consecutive batches of
# at most $batch_size elements and return the list of array references.
sub _split_into_batches {
    my ( $samples, $count, $batch_size ) = @_;

    # Guard against a zero or negative --parallel value.
    $batch_size = 1 if !$batch_size || $batch_size < 1;

    my @batches;
    for ( my $start = 0 ; $start < $count ; $start += $batch_size ) {
        my $end = $start + $batch_size - 1;
        $end = $count - 1 if $end > $count - 1;
        push @batches, [ @{$samples}[ $start .. $end ] ];
    }

    return @batches;
}

# Train the selected classifier on the training share of both corpora.
#
#   $ar_ham  - array reference of ham message paths
#   $ar_spam - array reference of spam message paths
#
# The first int(N * $train_fraction) entries of each array are used, split
# into batches of up to $parallel files, and the batches are fed to the
# learner alternating between spam and ham while both are available.
# The learner is learn_dspam, learn_bogofilter or learn_rspamc depending
# on $use_dspam / $use_bogofilter.
#
# Returns the sum of the counts reported by the learner.
#
# Compared to the previous batching loop, a class with exactly one
# training sample is no longer dropped, and $parallel == 0 no longer
# triggers a modulus-by-zero error.
sub learn_samples {
    my ( $ar_ham, $ar_spam ) = @_;
    my $processed = 0;

    my $learn_func;
    if ($use_dspam) {
        $learn_func = \&learn_dspam;
    }
    elsif ($use_bogofilter) {
        $learn_func = \&learn_bogofilter;
    }
    else {
        $learn_func = \&learn_rspamc;
    }

    my @files_ham = _split_into_batches( $ar_ham,
        int( scalar @{$ar_ham} * $train_fraction ), $parallel );
    my @files_spam = _split_into_batches( $ar_spam,
        int( scalar @{$ar_spam} * $train_fraction ), $parallel );

    my $total = scalar(@files_ham) + scalar(@files_spam);

    # Interleave spam and ham batches; once one kind is exhausted the
    # remaining batches of the other kind are used.
    for ( my $i = 0 ; $i < $total ; $i++ ) {
        my $args;
        my $spam;

        if ( $i % 2 == 0 ) {
            $args = pop @files_spam;
            $spam = 1;
            if ( !$args ) {
                $args = pop @files_ham;
                $spam = 0;
            }
        }
        else {
            $args = pop @files_ham;
            $spam = 0;
            if ( !$args ) {
                $args = pop @files_spam;
                $spam = 1;
            }
        }

        my $r = $learn_func->( $args, $spam );
        $processed += $r if $r;
    }

    return $processed;
}
|
|
|
# Classify a batch of messages with rspamc and update the statistics.
#
#   $files        - array reference of message file paths
#   $spam         - true if the batch is known spam, false if known ham
#   $fp_cnt       - scalar ref, incremented when the opposite class symbol
#                   is present with a probability >= $rspamc_prob_trigger
#                   (or with no parsable probability option)
#   $fn_cnt       - scalar ref, incremented when neither class symbol is
#                   present
#   $detected_cnt - scalar ref, incremented when only the expected class
#                   symbol is present
#
# A reply that carries the opposite symbol with a probability below the
# trigger is counted as processed but updates none of the three counters
# (this matches the previous behaviour).
#
# Returns the number of replies that contained a 'default' section.
# Dies if rspamc cannot be spawned.
sub check_rspamc {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;

    my $args_quoted = shell_quote @{$files};
    my $processed   = 0;

    # The spam and ham cases are mirror images: only the roles of the two
    # symbols are exchanged.
    my ( $wrong_symbol, $right_symbol ) =
      $spam
      ? ( $ham_symbol, $spam_symbol )
      : ( $spam_symbol, $ham_symbol );

    open( my $p, "$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" )
      or die "cannot spawn $rspamc: $!";

    while ( my $line = <$p> ) {
        my $res = eval { decode_json($line) };
        next unless ref($res) eq 'HASH' && $res->{'default'};

        $processed++;
        my $symbols = $res->{'default'};

        if ( $symbols->{$wrong_symbol} ) {
            my $m = $symbols->{$wrong_symbol}->{'options'}->[0];

            if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
                my $percentage = int($1);
                $$fp_cnt++ if $percentage >= $rspamc_prob_trigger;
            }
            else {
                $$fp_cnt++;
            }
        }
        elsif ( !$symbols->{$right_symbol} ) {
            $$fn_cnt++;
        }
        else {
            $$detected_cnt++;
        }
    }

    # Closing a pipe handle also waits for the child process.
    close($p);

    return $processed;
}
|
|
|
|
|
|
|
# Classify a batch of messages with bogofilter and update the statistics.
#
#   $files        - array reference of message file paths
#   $spam         - true if the batch is known spam, false if known ham
#   $fp_cnt       - scalar ref, incremented when the verdict is the
#                   opposite class ('H' for spam input, 'S' for ham input)
#   $fn_cnt       - scalar ref, incremented when the verdict is 'U'
#   $detected_cnt - scalar ref, incremented for the remaining verdict
#
# Returns the number of output lines that started with S, H or U followed
# by whitespace. Dies if bogofilter cannot be spawned.
sub check_bogofilter {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
    my $processed = 0;

    # Verdict letter that counts as a misclassification for this batch.
    my $wrong_verdict = $spam ? 'H' : 'S';

    foreach my $f ( @{$files} ) {
        my $args_quoted = shell_quote $f;

        open( my $p, "$bogofilter -t -I $args_quoted |" )
          or die "cannot spawn $bogofilter: $!";

        while ( my $line = <$p> ) {
            next unless $line =~ /^([SHU])\s+.*$/;
            my $verdict = $1;

            $processed++;

            if ( $verdict eq $wrong_verdict ) {
                $$fp_cnt++;
            }
            elsif ( $verdict eq 'U' ) {
                $$fn_cnt++;
            }
            else {
                $$detected_cnt++;
            }
        }

        # Closing a pipe handle also waits for the child process.
        close($p);
    }

    return $processed;
}
|
|
|
|
|
|
|
# Classify a batch of messages with dspam and update the statistics.
#
#   $files        - array reference of message file paths
#   $spam         - true if the batch is known spam, false if known ham
#   $fp_cnt       - scalar ref, incremented when dspam reports the opposite
#                   class and the probability passes the trigger test
#   $fn_cnt       - scalar ref, incremented when the result is neither
#                   'Spam' nor 'Innocent'
#   $detected_cnt - scalar ref, incremented when the expected class is
#                   reported
#
# Returns the number of X-DSPAM-Result lines that were parsed.
# A message file that cannot be opened is reported with a warning and
# skipped (the previous version did not check the open at all).
sub check_dspam {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
    my $processed = 0;

    foreach my $f ( @{$files} ) {

        # Three-argument, checked open; done before dspam is spawned so no
        # child is left waiting when the file is unreadable.
        my $inp;
        if ( !open( $inp, '<', $f ) ) {
            warn "cannot open $f: $!\n";
            next;
        }

        # Lexical handles replace the former global Reader/Writer globs.
        my ( $reader, $writer );
        my $pid = open2( $reader, $writer, "$dspam --user nobody --classify --stdout --mode=notrain" );

        while ( my $line = <$inp> ) {
            print {$writer} $line;
        }
        close($inp);

        # Signal end of input before reading the verdict.
        close($writer);

        while ( my $line = <$reader> ) {
            next
              unless $line =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$);

            my $result     = $1;
            my $percentage = int( $2 * 100.0 );

            $processed++;

            if ($spam) {
                if ( $result eq 'Innocent' ) {
                    $$fp_cnt++ if $percentage <= ( 100 - $rspamc_prob_trigger );
                }
                elsif ( $result ne 'Spam' ) {
                    $$fn_cnt++;
                }
                else {
                    $$detected_cnt++;
                }
            }
            else {
                if ( $result eq 'Spam' ) {
                    $$fp_cnt++ if $percentage >= $rspamc_prob_trigger;
                }
                elsif ( $result ne 'Innocent' ) {
                    $$fn_cnt++;
                }
                else {
                    $$detected_cnt++;
                }
            }
        }

        close($reader);

        # open2 does not reap the child; do it explicitly.
        waitpid( $pid, 0 );
    }

    return $processed;
}
|
|
|
|
|
|
|
# Run the selected classifier over the validation set and print statistics.
#
#   $hr - hash reference mapping message file path => 1 (spam) or 0 (ham)
#
# Files are grouped into batches of $parallel paths per class, the batches
# are shuffled, and each one is handed to check_dspam, check_bogofilter or
# check_rspamc depending on $use_dspam / $use_bogofilter. The totals and the
# false positive / false negative rates are printed to stdout.
#
# Fixes over the previous version:
#   * trailing partial batches are no longer dropped (the old code relied
#     on a counter that was never incremented to flush them);
#   * the rate lines no longer divide by zero when a class has no
#     processed messages (the rate is printed as 0.00 instead).
sub cross_validate {
    my ($hr) = @_;

    my $processed     = 0;
    my $fp_spam       = 0;
    my $fn_spam       = 0;
    my $fp_ham        = 0;
    my $fn_ham        = 0;
    my $total_spam    = 0;
    my $total_ham     = 0;
    my $detected_spam = 0;
    my $detected_ham  = 0;
    my @files_spam;
    my @files_ham;
    my @cur_spam;
    my @cur_ham;
    my $check_func;

    if ($use_dspam) {
        $check_func = \&check_dspam;
    }
    elsif ($use_bogofilter) {
        $check_func = \&check_bogofilter;
    }
    else {
        $check_func = \&check_rspamc;
    }

    # Guard against a zero or negative --parallel value.
    my $batch_size = ( $parallel && $parallel > 0 ) ? $parallel : 1;

    while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
        if ($spam) {
            push @cur_spam, $fn;
            if ( scalar @cur_spam >= $batch_size ) {
                push @files_spam, [@cur_spam];
                @cur_spam = ();
            }
        }
        else {
            push @cur_ham, $fn;
            if ( scalar @cur_ham >= $batch_size ) {
                push @files_ham, [@cur_ham];
                @cur_ham = ();
            }
        }
    }

    # Flush whatever is left in the partially filled batches.
    push @files_spam, [@cur_spam] if @cur_spam;
    push @files_ham,  [@cur_ham]  if @cur_ham;

    shuffle_array( \@files_spam );

    # For spam input a misclassification is counted in $fp_ham.
    foreach my $fn (@files_spam) {
        my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
        $total_spam += $r;
        $processed  += $r;
    }

    shuffle_array( \@files_ham );

    # For ham input a misclassification is counted in $fp_spam.
    foreach my $fn (@files_ham) {
        my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
        $total_ham += $r;
        $processed += $r;
    }

    # Percentage helper that tolerates an empty class.
    my $rate = sub {
        my ( $count, $total ) = @_;
        return $total ? $count / $total * 100.0 : 0;
    };

    printf "Scanned %d messages\n%d spam messages (%d detected)\n%d ham messages (%d detected)\n",
      $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;

    # NOTE(review): $fp_ham is accumulated while checking spam batches but
    # is reported relative to $total_ham (and vice versa for $fp_spam);
    # the denominators are kept as they were - confirm this is intended.
    printf "\nHam FP rate: %.2f%% (%d messages)\nHam FN rate: %.2f%% (%d messages)\n",
      $rate->( $fp_ham, $total_ham ), $fp_ham,
      $rate->( $fn_ham, $total_ham ), $fn_ham;

    printf "\nSpam FP rate: %.2f%% (%d messages)\nSpam FN rate: %.2f%% (%d messages)\n",
      $rate->( $fp_spam, $total_spam ), $fp_spam,
      $rate->( $fn_spam, $total_spam ), $fn_spam;
}
|
|
|
|
|
|
|
# Both corpora are mandatory: refuse to continue without --spam and --ham.
if ( !$spam_dir || !$ham_dir ) {
    die "spam or/and ham directories are not specified";
}
|
|
|
|
|
|
|
my @spam_samples; |
|
|
|
@ -473,24 +463,23 @@ shuffle_array( \@spam_samples ); |
|
|
|
shuffle_array( \@ham_samples );

# Training phase (skipped with --check-only): learn on the leading
# $train_fraction share of both corpora and report the elapsed time.
if ( !$check_only ) {
    my $t0      = [gettimeofday];
    my $learned = learn_samples( \@ham_samples, \@spam_samples );
    my $t1      = [gettimeofday];

    printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 );
}

# Validation phase: everything after the training share of each corpus is
# checked. The hash maps file path => 1 for spam and 0 for ham.
my %validation_set;

my $len = int( scalar @spam_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
    $validation_set{ $spam_samples[$i] } = 1;
}

# The upper bound must be the size of the ham corpus. The previous loop
# used scalar @spam_samples here, which dropped ham samples when there was
# more ham than spam and inserted an undefined key when there was less.
$len = int( scalar @ham_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @ham_samples ; $i++ ) {
    $validation_set{ $ham_samples[$i] } = 0;
}

cross_validate( \%validation_set );
|
|
|
|