diff -uNr Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/Conf/Parser.pm Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/Conf/Parser.pm --- Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/Conf/Parser.pm 2007-02-14 03:17:13.000000000 +0900 +++ Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/Conf/Parser.pm 2007-02-15 20:58:39.000000000 +0900 @@ -698,6 +698,9 @@ # We don't do priorities for $Mail::SpamAssassin::Conf::TYPE_RBL_EVALS $conf->{rbl_evals}->{$name} = \@args; } + elsif ($type == $Mail::SpamAssassin::Conf::TYPE_NBODY_EVALS) { + $conf->{nbody_evals}->{$priority}->{$name} = \@args if ($conf->{normalize_charset}); + } elsif ($type == $Mail::SpamAssassin::Conf::TYPE_RAWBODY_EVALS) { $conf->{rawbody_evals}->{$priority}->{$name} = \@args; } @@ -740,6 +743,9 @@ elsif ($type == $Mail::SpamAssassin::Conf::TYPE_URI_TESTS) { $conf->{uri_tests}->{$priority}->{$name} = $text; } + elsif ($type == $Mail::SpamAssassin::Conf::TYPE_NBODY_TESTS and $conf->{normalize_charset}) { + $conf->{nbody_tests}->{$priority}->{$name} = Encode::decode_utf8($text) unless (Encode::is_utf8($text)); + } elsif ($type == $Mail::SpamAssassin::Conf::TYPE_RAWBODY_TESTS) { $conf->{rawbody_tests}->{$priority}->{$name} = $text; } @@ -839,6 +845,7 @@ # all of these rule types are regexps if ($type == $Mail::SpamAssassin::Conf::TYPE_BODY_TESTS || + $type == $Mail::SpamAssassin::Conf::TYPE_NBODY_TESTS || $type == $Mail::SpamAssassin::Conf::TYPE_FULL_TESTS || $type == $Mail::SpamAssassin::Conf::TYPE_RAWBODY_TESTS || $type == $Mail::SpamAssassin::Conf::TYPE_URI_TESTS) diff -uNr Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/Conf.pm Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/Conf.pm --- Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/Conf.pm 2007-02-14 03:17:13.000000000 +0900 +++ Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/Conf.pm 2007-02-15 20:58:39.000000000 +0900 @@ -91,7 +91,7 @@ $TYPE_HEAD_TESTS $TYPE_HEAD_EVALS $TYPE_BODY_TESTS $TYPE_BODY_EVALS $TYPE_FULL_TESTS $TYPE_FULL_EVALS $TYPE_RAWBODY_TESTS $TYPE_RAWBODY_EVALS $TYPE_URI_TESTS $TYPE_URI_EVALS -$TYPE_META_TESTS $TYPE_RBL_EVALS +$TYPE_META_TESTS $TYPE_RBL_EVALS $TYPE_NBODY_TESTS $TYPE_NBODY_EVALS }; @ISA = qw(); @@ -110,11 +110,13 @@ $TYPE_URI_EVALS = 0x0011; $TYPE_META_TESTS = 0x0012; $TYPE_RBL_EVALS = 0x0013; +$TYPE_NBODY_TESTS = 0x0014; +$TYPE_NBODY_EVALS = 0x0015; my @rule_types = ("body_tests", "uri_tests", "uri_evals", "head_tests", "head_evals", "body_evals", "full_tests", "full_evals", "rawbody_tests", "rawbody_evals", - "rbl_evals", "meta_tests"); + "rbl_evals", "meta_tests", "nbody_tests", "nbody_evals"); $VERSION = 'bogus'; # avoid CPAN.pm picking up version strings later @@ -839,6 +841,18 @@ type => $CONF_TYPE_STRING }); +=item normalize_charset ( 0 | 1) (default: 0) + +If you set this option, messages are checked after UTF-8 encoding conversion. + +=cut + + push (@cmds, { + setting => 'normalize_charset', + default => 0, + type => $CONF_TYPE_BOOL + }); + =back =head2 NETWORK TEST OPTIONS @@ -1951,6 +1965,45 @@ } }); +=item nbody SYMBOLIC_TEST_NAME /pattern/modifiers + +Define a nbody pattern test. C is a Perl regular expression. Note: +as per the header tests, C<#> must be escaped (C<\#>) or else it is considered +the beginning of a comment. + +The 'nbody' in this case is the utf-8 normalized textual parts of the +message body; +any non-text MIME parts are stripped, and the message decoded from +Quoted-Printable or Base-64-encoded format if necessary. The message +Subject header is considered part of the nbody and becomes the first +paragraph when running the rules. All HTML tags and line breaks will +be removed before matching. + +=item nbody SYMBOLIC_TEST_NAME eval:name_of_eval_method([args]) + +Define a nbody eval test. See above. + +=cut + + push (@cmds, { + setting => 'nbody', + is_frequent => 1, + is_priv => 1, + code => sub { + my ($self, $key, $value, $line) = @_; + if ($value =~ /^(\S+)\s+eval:(.*)$/) { + $self->{parser}->add_test ($1, $2, $TYPE_NBODY_EVALS); + } + else { + my @values = split(/\s+/, $value, 2); + if (@values != 2) { + return $MISSING_REQUIRED_VALUE; + } + $self->{parser}->add_test (@values, $TYPE_NBODY_TESTS); + } + } + }); + =item uri SYMBOLIC_TEST_NAME /pattern/modifiers Define a uri pattern test. C is a Perl regular expression. Note: as @@ -2803,6 +2856,8 @@ $self->{rawbody_evals} = { }; $self->{meta_tests} = { }; $self->{eval_plugins} = { }; + $self->{nbody_tests} = { }; + $self->{nbody_evals} = { }; # testing stuff $self->{regression_tests} = { }; @@ -3103,6 +3158,7 @@ return 0 if (!defined ($type)); if (($type == $TYPE_BODY_TESTS) || ($type == $TYPE_BODY_EVALS) + || ($type == $TYPE_NBODY_TESTS) || ($type == $TYPE_NBODY_EVALS) || ($type == $TYPE_URI_TESTS) || ($type == $TYPE_URI_EVALS)) { # some rawbody go off of headers... diff -uNr Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/PerMsgStatus.pm Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/PerMsgStatus.pm --- Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/PerMsgStatus.pm 2007-02-14 03:17:13.000000000 +0900 +++ Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/PerMsgStatus.pm 2007-02-15 20:58:39.000000000 +0900 @@ -85,6 +85,7 @@ 'disable_auto_learning' => 0, 'auto_learn_status' => undef, 'conf' => $main->{conf}, + 'normalize_charset' => $main->{conf}->{normalize_charset}, }; if (defined $opts && $opts->{disable_auto_learning}) { @@ -160,6 +161,7 @@ my $needs_dnsbl_harvest_p = 1; # harvest needs to be run my $decoded = $self->get_decoded_stripped_body_text_array(); + my $normalized = $self->get_normalized_body_text_array(); my $bodytext = $self->get_decoded_body_text_array(); my $fulltext = $self->{msg}->get_pristine(); @@ -199,6 +201,11 @@ $self->do_body_tests($priority, $decoded); $self->do_body_uri_tests($priority, @uris); $self->do_body_eval_tests($priority, $decoded); + + if ($self->{normalize_charset}) { + $self->do_nbody_tests($priority, $normalized); + $self->do_nbody_eval_tests($priority, $normalized); + } $self->do_rawbody_tests($priority, $bodytext); $self->do_rawbody_eval_tests($priority, $bodytext); @@ -226,6 +233,7 @@ # finished running rules delete $self->{current_rule_name}; undef $decoded; + undef $normalized; undef $bodytext; undef $fulltext; @@ -599,7 +607,13 @@ my ($self) = @_; my $str = ''; - my $ary = $self->get_decoded_stripped_body_text_array(); + my $ary; + if ($self->{'normalize_charset'}) { + $ary = $self->get_normalized_body_text_array(); + } + else { + $ary = $self->get_decoded_stripped_body_text_array(); + } shift @{$ary}; # drop the subject line my $numlines = 3; @@ -829,6 +843,17 @@ my $description = $self->{main}->{'encapsulated_content_description'}; + if ($self->{normalize_charset}) { + my $cs = 'utf8'; + if ($self->{conf}->{report_charset}) { + $cs = $self->{conf}->{report_charset}; + Encode::from_to($report, 'utf8', $cs); + } + else { + $report_charset = "; charset=UTF-8"; + } + } + # Note: the message should end in blank line since mbox format wants # blank line at end and messages may be concatenated! In addition, the # x-spam-type parameter is fixed since we will use it later to recognize @@ -962,8 +987,11 @@ return $text unless ($text =~ /[\x80-\xff]/); my $cs = 'ISO-8859-1'; - if ($self->{report_charset}) { - $cs = $self->{report_charset}; + if ($self->{conf}->{report_charset}) { + $cs = $self->{conf}->{report_charset}; + } + if ($self->{normalize_charset}) { + Encode::from_to($text, 'utf8', $cs); } my @hexchars = split('', '0123456789abcdef'); @@ -1316,6 +1344,9 @@ if (defined &{'_body_uri_tests_'.$clean_priority}) { undef &{'_body_uri_tests_'.$clean_priority}; } + if (defined &{'_nbody_tests_'.$clean_priority}) { + undef &{'_nbody_tests_'.$clean_priority}; + } if (defined &{'_rawbody_tests_'.$clean_priority}) { undef &{'_rawbody_tests_'.$clean_priority}; } @@ -1387,6 +1418,10 @@ return $_[0]->{msg}->get_rendered_body_text_array(); } +sub get_normalized_body_text_array { + return $_[0]->{msg}->get_normalized_body_text_array(); +} + ########################################################################### =item $status->get (header_name [, default_value]) @@ -1477,12 +1512,16 @@ my $getaddr = 0; my $getname = 0; my $getraw = 0; + my $getutf8 = 0; # special queries if (index($request, ':') != -1) { $getaddr = ($request =~ s/:addr$//); $getname = ($request =~ s/:name$//); $getraw = ($request =~ s/:raw$//); + if ($self->{normalize_charset}) { + $getutf8 = ($request =~ s/:utf8$//); + } } # ALL: entire raw headers @@ -1511,26 +1550,26 @@ } # ToCc: the combined recipients list elsif ($request eq 'ToCc') { - $result = join("\n", $self->{msg}->get_header('To', $getraw)); + $result = join("\n", $self->{msg}->get_header('To', $getraw, $getutf8)); if ($result) { chomp $result; $result .= ", " if $result =~ /\S/; } - $result .= join("\n", $self->{msg}->get_header('Cc', $getraw)); + $result .= join("\n", $self->{msg}->get_header('Cc', $getraw, $getutf8)); $result = undef if !$result; } # MESSAGEID: handle lists which move the real message-id to another # header for resending. elsif ($request eq 'MESSAGEID') { $result = join("\n", grep { defined($_) && length($_) > 0 } - $self->{msg}->get_header('X-Message-Id', $getraw), - $self->{msg}->get_header('Resent-Message-Id', $getraw), - $self->{msg}->get_header('X-Original-Message-ID', $getraw), - $self->{msg}->get_header('Message-Id', $getraw)); + $self->{msg}->get_header('X-Message-Id', $getraw, $getutf8), + $self->{msg}->get_header('Resent-Message-Id', $getraw, $getutf8), + $self->{msg}->get_header('X-Original-Message-ID', $getraw, $getutf8), + $self->{msg}->get_header('Message-Id', $getraw, $getutf8)); } # a conventional header else { - $result = join('', $self->{msg}->get_header($request, $getraw)); + $result = join('', $self->{msg}->get_header($request, $getraw, $getutf8)); # metadata if (!$result) { @@ -1831,6 +1870,96 @@ } } +sub do_nbody_tests { + my ($self, $priority, $textary) = @_; + local ($_); + + dbg("rules: running nbody-text per-line regexp tests; score so far=".$self->{score}); + + my $doing_user_rules = + $self->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_NBODY_TESTS}; + + # clean up priority value so it can be used in a subroutine name + my $clean_priority; + ($clean_priority = $priority) =~ s/-/neg/; + + $self->{test_log_msgs} = (); # clear test state + if (defined &{'Mail::SpamAssassin::PerMsgStatus::_nbody_tests_'.$clean_priority} + && !$doing_user_rules) { + no strict "refs"; + &{'Mail::SpamAssassin::PerMsgStatus::_nbody_tests_'.$clean_priority}($self, @$textary); + use strict "refs"; + return; + } + + # build up the eval string... + my $evalstr = ''; + my $evalstr2 = ''; + + while (my($rulename, $pat) = each %{$self->{conf}{nbody_tests}->{$priority}}) { + $evalstr .= ' + if ($self->{conf}->{scores}->{q{'.$rulename.'}}) { + # call procedurally as it is faster. + '.$rulename.'_nbody_test($self,@_); + } + '; + + if ($doing_user_rules) { + next if (!$self->is_user_rule_sub ($rulename.'_nbody_test')); + } + + $evalstr2 .= ' + sub '.$rulename.'_nbody_test { + my $self = shift; + foreach (@_) { + '.$self->hash_line_for_rule($rulename).' + if ('.$pat.') { + $self->got_pattern_hit(q{'.$rulename.'}, "NBODY: "); + '. $self->ran_rule_debug_code($rulename, "nbody", 2) . ' + # Ok, we hit, stop now. + last; + } + } + } + '; + } + + # clear out a previous version of this fn, if already defined + if (defined &{'_nbody_tests_'.$clean_priority}) { + undef &{'_nbody_tests_'.$clean_priority}; + } + + return unless ($evalstr); + + # generate the loop that goes through each line... + $evalstr = <<"EOT"; +{ + package Mail::SpamAssassin::PerMsgStatus; + + $evalstr2; + + sub _nbody_tests_$clean_priority { + my \$self = shift; + $evalstr; + } + + 1; +} +EOT + + # and run it. + eval $evalstr; + if ($@) { + warn("rules: failed to compile nbody tests, skipping:\n" . "\t($@)\n"); + $self->{rule_errors}++; + } + else { + no strict "refs"; + &{'Mail::SpamAssassin::PerMsgStatus::_nbody_tests_'.$clean_priority}($self, @$textary); + use strict "refs"; + } +} + sub is_user_rule_sub { my ($self, $subname) = @_; return 0 if (eval 'defined &Mail::SpamAssassin::PerMsgStatus::'.$subname); @@ -2424,6 +2553,12 @@ $self->run_eval_tests ($self->{conf}->{body_evals}->{$priority}, 'BODY: ', $bodystring); } +sub do_nbody_eval_tests { + my ($self, $priority, $bodystring) = @_; + return unless (defined($self->{conf}->{nbody_evals}->{$priority})); + $self->run_eval_tests ($self->{conf}->{nbody_evals}->{$priority}, 'NBODY: ', $bodystring); +} + sub do_rawbody_eval_tests { my ($self, $priority, $bodystring) = @_; return unless (defined($self->{conf}->{rawbody_evals}->{$priority})); diff -uNr Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm --- Mail-SpamAssassin-3.1.8.orig/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm 2006-08-30 00:16:47.000000000 +0900 +++ Mail-SpamAssassin-3.1.8/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm 2007-02-15 20:58:39.000000000 +0900 @@ -82,7 +82,7 @@ my $end = $opts->{conf}->{replace_end}; # this is the version-specific code - for my $type (qw|body_tests rawbody_tests head_tests full_tests uri_tests|) { + for my $type (qw|body_tests nbody_tests rawbody_tests head_tests full_tests uri_tests|) { for my $priority (keys %{$opts->{conf}->{$type}}) { while (my ($rule, $re) = each %{$opts->{conf}->{$type}->{$priority}}) { # skip if not listed by replace_rules