diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Bayes.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Bayes.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Bayes.pm 2007-06-08 22:09:55.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Bayes.pm 2007-12-28 10:00:35.000000000 +0900 @@ -220,6 +220,15 @@ # will require a longer token than English ones.) use constant MAX_TOKEN_LENGTH => 15; +# Skip if a token is too short. +our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?: + [\x00-\x7F] # 1 byte + | [\xC0-\xDF][\x80-\xBF] # 2 bytes + | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes + | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes + | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana +)}x; + ########################################################################### sub new { @@ -233,6 +242,7 @@ 'log_raw_counts' => 0, 'use_ignores' => 1, 'tz' => Mail::SpamAssassin::Util::local_tz(), + 'normalize_charset' => $main->{conf}->{normalize_charset}, }; bless ($self, $class); @@ -348,7 +358,7 @@ # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" - tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; + tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs; # DO split on "..." or "--" or "---"; common formatting error resulting in # hapaxes. Keep the separator itself as a token, though, as long ones can @@ -377,6 +387,11 @@ # next if ( defined $magic_re && /$magic_re/ ); + # Skip short UTF-8 tokens. + if ($self->{normalize_charset}) { + next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o); + } + # *do* keep 3-byte tokens; there's some solid signs in there my $len = length($token); @@ -414,14 +429,17 @@ # the domain ".net" appeared in the To header. # if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { - if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { - # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, - # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan - # to me! (jm) - while ($token =~ s/^(..?)//) { - push (@rettokens, "8:$1"); - } - next; + unless ($self->{normalize_charset}) { + # except UTF-8 characters + if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { + # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, + # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan + # to me! (jm) + while ($token =~ s/^(..?)//) { + push (@rettokens, "8:$1"); + } + next; + } } if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) @@ -998,9 +1016,29 @@ $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); + + if ($self->{normalize_charset}) { + my $tokenizer = $self->get_tokenizer($msg); + if (ref($tokenizer)) { + $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body}); + $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz}); + } + } return $msgdata; } +sub get_tokenizer { + my ($self, $msg) = @_; + + my $tokenizer; + my @languages = split(/\s+/, $msg->{msg}->get_language()); + foreach my $lang (@languages) { + $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang}; + last if (ref($tokenizer)); + } + return $tokenizer; +} + ########################################################################### sub sync { diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Conf/Parser.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Conf/Parser.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Conf/Parser.pm 2007-06-08 22:09:54.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Conf/Parser.pm 2007-12-28 10:00:35.000000000 +0900 @@ -698,6 +698,9 @@ # We don't do priorities for $Mail::SpamAssassin::Conf::TYPE_RBL_EVALS $conf->{rbl_evals}->{$name} = \@args; } + elsif ($type == $Mail::SpamAssassin::Conf::TYPE_NBODY_EVALS) { + $conf->{nbody_evals}->{$priority}->{$name} = \@args if ($conf->{normalize_charset}); + } elsif ($type == $Mail::SpamAssassin::Conf::TYPE_RAWBODY_EVALS) { $conf->{rawbody_evals}->{$priority}->{$name} = \@args; } @@ -740,6 +743,9 @@ elsif ($type == $Mail::SpamAssassin::Conf::TYPE_URI_TESTS) { $conf->{uri_tests}->{$priority}->{$name} = $text; } + elsif ($type == $Mail::SpamAssassin::Conf::TYPE_NBODY_TESTS and $conf->{normalize_charset}) { + $conf->{nbody_tests}->{$priority}->{$name} = Encode::decode_utf8($text) unless (Encode::is_utf8($text)); + } elsif ($type == $Mail::SpamAssassin::Conf::TYPE_RAWBODY_TESTS) { $conf->{rawbody_tests}->{$priority}->{$name} = $text; } @@ -839,6 +845,7 @@ # all of these rule types are regexps if ($type == $Mail::SpamAssassin::Conf::TYPE_BODY_TESTS || + $type == $Mail::SpamAssassin::Conf::TYPE_NBODY_TESTS || $type == $Mail::SpamAssassin::Conf::TYPE_FULL_TESTS || $type == $Mail::SpamAssassin::Conf::TYPE_RAWBODY_TESTS || $type == $Mail::SpamAssassin::Conf::TYPE_URI_TESTS) diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Conf.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Conf.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Conf.pm 2007-06-08 22:09:55.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Conf.pm 2007-12-28 10:00:35.000000000 +0900 @@ -91,7 +91,7 @@ $TYPE_HEAD_TESTS $TYPE_HEAD_EVALS $TYPE_BODY_TESTS $TYPE_BODY_EVALS $TYPE_FULL_TESTS $TYPE_FULL_EVALS $TYPE_RAWBODY_TESTS $TYPE_RAWBODY_EVALS $TYPE_URI_TESTS $TYPE_URI_EVALS -$TYPE_META_TESTS $TYPE_RBL_EVALS +$TYPE_META_TESTS $TYPE_RBL_EVALS $TYPE_NBODY_TESTS $TYPE_NBODY_EVALS }; @ISA = qw(); @@ -110,11 +110,13 @@ $TYPE_URI_EVALS = 0x0011; $TYPE_META_TESTS = 0x0012; $TYPE_RBL_EVALS = 0x0013; +$TYPE_NBODY_TESTS = 0x0014; +$TYPE_NBODY_EVALS = 0x0015; my @rule_types = ("body_tests", "uri_tests", "uri_evals", "head_tests", "head_evals", "body_evals", "full_tests", "full_evals", "rawbody_tests", "rawbody_evals", - "rbl_evals", "meta_tests"); + "rbl_evals", "meta_tests", "nbody_tests", "nbody_evals"); $VERSION = 'bogus'; # avoid CPAN.pm picking up version strings later @@ -839,6 +841,18 @@ type => $CONF_TYPE_STRING }); +=item normalize_charset ( 0 | 1) (default: 0) + +If you set this option, messages are checked after UTF-8 encoding conversion. + +=cut + + push (@cmds, { + setting => 'normalize_charset', + default => 0, + type => $CONF_TYPE_BOOL + }); + =back =head2 NETWORK TEST OPTIONS @@ -1951,6 +1965,45 @@ } }); +=item nbody SYMBOLIC_TEST_NAME /pattern/modifiers + +Define a nbody pattern test. C is a Perl regular expression. Note: +as per the header tests, C<#> must be escaped (C<\#>) or else it is considered +the beginning of a comment. + +The 'nbody' in this case is the utf-8 normalized textual parts of the +message body; +any non-text MIME parts are stripped, and the message decoded from +Quoted-Printable or Base-64-encoded format if necessary. The message +Subject header is considered part of the nbody and becomes the first +paragraph when running the rules. All HTML tags and line breaks will +be removed before matching. + +=item nbody SYMBOLIC_TEST_NAME eval:name_of_eval_method([args]) + +Define a nbody eval test. See above. + +=cut + + push (@cmds, { + setting => 'nbody', + is_frequent => 1, + is_priv => 1, + code => sub { + my ($self, $key, $value, $line) = @_; + if ($value =~ /^(\S+)\s+eval:(.*)$/) { + $self->{parser}->add_test ($1, $2, $TYPE_NBODY_EVALS); + } + else { + my @values = split(/\s+/, $value, 2); + if (@values != 2) { + return $MISSING_REQUIRED_VALUE; + } + $self->{parser}->add_test (@values, $TYPE_NBODY_TESTS); + } + } + }); + =item uri SYMBOLIC_TEST_NAME /pattern/modifiers Define a uri pattern test. C is a Perl regular expression. Note: as @@ -2803,6 +2856,8 @@ $self->{rawbody_evals} = { }; $self->{meta_tests} = { }; $self->{eval_plugins} = { }; + $self->{nbody_tests} = { }; + $self->{nbody_evals} = { }; # testing stuff $self->{regression_tests} = { }; @@ -3103,6 +3158,7 @@ return 0 if (!defined ($type)); if (($type == $TYPE_BODY_TESTS) || ($type == $TYPE_BODY_EVALS) + || ($type == $TYPE_NBODY_TESTS) || ($type == $TYPE_NBODY_EVALS) || ($type == $TYPE_URI_TESTS) || ($type == $TYPE_URI_EVALS)) { # some rawbody go off of headers... diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/HTML.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/HTML.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/HTML.pm 2007-06-08 22:09:55.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/HTML.pm 2007-12-28 10:00:35.000000000 +0900 @@ -756,7 +756,13 @@ } } else { - $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; + if ( $text =~ /[\x80-\xff]{2}/ ) { + # multibyte string + $text =~ s/[ \t\n\r\f\x0b]+/ /g; + } + else { + $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; + } # trim leading whitespace if previous element was whitespace if (@{ $self->{text} } && defined $self->{text_whitespace} && diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Message/Node.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Message/Node.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Message/Node.pm 2006-09-29 22:06:39.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Message/Node.pm 2007-12-28 10:00:35.000000000 +0900 @@ -42,6 +42,7 @@ use Mail::SpamAssassin::Constants qw(:sa); use Mail::SpamAssassin::HTML; use Mail::SpamAssassin::Logger; +use Mail::SpamAssassin::Util::Charset; =item new() @@ -56,6 +57,7 @@ my $self = { headers => {}, + utf8_headers => {}, raw_headers => {}, body_parts => [], header_order => [] @@ -66,6 +68,7 @@ if (defined $opts->{'subparse'}) { $self->{subparse} = $opts->{'subparse'}; } + $self->{normalize_charset} = $opts->{'normalize_charset'} || 0; bless($self,$class); $self; @@ -181,10 +184,16 @@ if ( !exists $self->{'headers'}->{$key} ) { $self->{'headers'}->{$key} = []; $self->{'raw_headers'}->{$key} = []; + $self->{'utf8_headers'}->{$key} = []; } - push @{ $self->{'headers'}->{$key} }, _decode_header($raw_value); + my ($decoded_value, $utf8_value) = + _decode_header($raw_value, $self->{'normalize_charset'}); + push @{ $self->{'headers'}->{$key} }, $decoded_value; push @{ $self->{'raw_headers'}->{$key} }, $raw_value; + if ($self->{'normalize_charset'}) { + push @{ $self->{'utf8_headers'}->{$key} }, $utf8_value; + } return $self->{'headers'}->{$key}->[-1]; } @@ -234,6 +243,40 @@ } } +=item utf8_header() + +Retrieves the normalized version of headers from a specific MIME part. +The only parameter is the header name. Header names are case-insensitive. + +For retrieval, if utf8_header() is called in an array context, an array +will be returned with each header entry in a different element. In a +scalar context, the last specific header is returned. + +ie: If 'Subject' is specified as the header, and there are 2 Subject +headers in a message, the last/bottom one in the message is returned in +scalar context or both are returned in array context. + +=cut + +# Retrieve utf8 headers from a given MIME object +# +sub utf8_header { + my $self = shift; + my $key = lc(shift); + + # Trim whitespace off of the header keys + $key =~ s/^\s+//; + $key =~ s/\s+$//; + + if (wantarray) { + return unless exists $self->{'utf8_headers'}->{$key}; + return @{ $self->{'utf8_headers'}->{$key} }; + } + else { + return '' unless exists $self->{'utf8_headers'}->{$key}; + return $self->{'utf8_headers'}->{$key}->[-1]; + } +} =item add_body_part() Adds a Node child object to the current node object. @@ -400,11 +443,48 @@ $self->{rendered_type} = $self->{type}; $self->{rendered} = $text; } + + if ($self->{'normalize_charset'}) { + my ($charset, $normalized_text) = + Mail::SpamAssassin::Util::Charset::normalize_charset($self->{'charset'}, $self->{rendered}); + $self->{normalized} = $normalized_text; + $self->{charset} = $charset; + $self->{language} = + Mail::SpamAssassin::Util::Charset::get_language($charset, $normalized_text); + + if ($self->{visible_rendered}) { + my $visible_rendered; + (undef, $visible_rendered) = + Mail::SpamAssassin::Util::Charset::normalize_charset($charset, $self->{visible_rendered}); + $self->{visible_rendered} = $visible_rendered; + } + else { + $self->{visible_rendered} = $self->{'normalized'}; + } + if ($self->{invisible_rendered}) { + my $invisible_rendered; + (undef, $invisible_rendered) = + Mail::SpamAssassin::Util::Charset::normalize_charset($charset, $self->{invisible_rendered}); + $self->{invisible_rendered} = $invisible_rendered; + } + } } return ($self->{rendered_type}, $self->{rendered}); } +=item normalized() + +Render and return the normalized text in this part. + +=cut + +sub normalized { + my ($self) = @_; + $self->rendered(); # ignore return, we want just this: + return ($self->{rendered_type}, $self->{normalized}); +} + =item visible_rendered() Render and return the visible text in this part. @@ -478,6 +558,7 @@ foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) { delete $self->{'headers'}->{$_}; delete $self->{'raw_headers'}->{$_}; + delete $self->{'utf8_headers'}->{$_}; } my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}}); @@ -488,9 +569,10 @@ sub __decode_header { my ( $encoding, $cte, $data ) = @_; + my $decoded_data; if ( $cte eq 'B' ) { # base 64 encoded - return Mail::SpamAssassin::Util::base64_decode($data); + $decoded_data = Mail::SpamAssassin::Util::base64_decode($data); } elsif ( $cte eq 'Q' ) { # quoted printable @@ -498,36 +580,56 @@ # the RFC states that in the encoded text, "_" is equal to "=20" $data =~ s/_/=20/g; - return Mail::SpamAssassin::Util::qp_decode($data); + $decoded_data = Mail::SpamAssassin::Util::qp_decode($data); } else { # not possible since the input has already been limited to 'B' and 'Q' die "message: unknown encoding type '$cte' in RFC2047 header"; } + if ($encoding) { + ($encoding, $decoded_data) = + Mail::SpamAssassin::Util::Charset::normalize_charset($encoding, $decoded_data); + } + return $decoded_data; } # Decode base64 and quoted-printable in headers according to RFC2047. # sub _decode_header { - my($header) = @_; + my($header, $normalize_charset) = @_; - return '' unless $header; + return ('', '') unless $header; # deal with folding and cream the newlines and such $header =~ s/\n[ \t]+/\n /g; $header =~ s/\r?\n//g; - return $header unless $header =~ /=\?/; + my $utf8_header; + unless ($header =~ /=\?/) { + if ($normalize_charset) { + $utf8_header = $header; + if ($header =~ /[\x1b\x80-\xff]/) { + (undef, $utf8_header) = + Mail::SpamAssassin::Util::Charset::normalize_charset(undef, $header); + } + } + return ($header, $utf8_header); + } # multiple encoded sections must ignore the interim whitespace. # to avoid possible FPs with (\s+(?==\?))?, look for the whole RE # separated by whitespace. 1 while ($header =~ s/(=\?[\w_-]+\?[bqBQ]\?[^?]+\?=)\s+(=\?[\w_-]+\?[bqBQ]\?[^?]+\?=)/$1$2/g); + if ($normalize_charset) { + $utf8_header = $header; + $utf8_header =~ + s/=\?([\w_-]+)\?([bqBQ])\?([^?]+)\?=/__decode_header($1, uc($2), $3)/ge; + } $header =~ - s/=\?([\w_-]+)\?([bqBQ])\?([^?]+)\?=/__decode_header($1, uc($2), $3)/ge; + s/=\?([\w_-]+)\?([bqBQ])\?([^?]+)\?=/__decode_header(undef, uc($2), $3)/ge; - return $header; + return ($header, $utf8_header); } =item get_header() @@ -550,20 +652,27 @@ # TODO: this could be made much faster by only processing all headers # when called in array context, otherwise just do one header sub get_header { - my ($self, $hdr, $raw) = @_; + my ($self, $hdr, $raw, $utf8 ) = @_; $raw ||= 0; + $utf8 ||= 0; # And now pick up all the entries into a list # This is assumed to include a newline at the end ... # This is also assumed to have removed continuation bits ... - # Deal with the possibility that header() or raw_header() returns undef + # Deal with the possibility that header(), raw_header() or utf8_header() + # returns undef my @hdrs; if ( $raw ) { if (@hdrs = $self->raw_header($hdr)) { @hdrs = map { s/\r?\n\s+/ /g; $_; } @hdrs; } } + elsif ( $utf8 ) { + if (@hdrs = $self->utf8_header($hdr)) { + @hdrs = map { "$_\n" } @hdrs; + } + } else { if (@hdrs = $self->header($hdr)) { @hdrs = map { "$_\n" } @hdrs; @@ -647,14 +756,18 @@ # Clean up ourself undef $self->{'headers'}; undef $self->{'raw_headers'}; + undef $self->{'utf8_headers'}; undef $self->{'header_order'}; undef $self->{'raw'}; undef $self->{'decoded'}; undef $self->{'rendered'}; + undef $self->{'normalized'}; undef $self->{'visible_rendered'}; undef $self->{'invisible_rendered'}; undef $self->{'type'}; undef $self->{'rendered_type'}; + undef $self->{'charset'}; + undef $self->{'language'}; # Clean up our kids if (exists $self->{'body_parts'}) { diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Message.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Message.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Message.pm 2006-09-29 22:06:40.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Message.pm 2007-12-28 10:00:35.000000000 +0900 @@ -111,6 +111,7 @@ my($opts) = @_; my $message = $opts->{'message'} || \*STDIN; my $parsenow = $opts->{'parsenow'} || 0; + $self->{normalize_charset} = $opts->{'normalize_charset'}; # Specifies whether or not to parse message/rfc822 parts into its own tree. # If the # > 0, it'll subparse, otherwise it won't. By default, do one @@ -544,6 +545,7 @@ delete $self->{pristine_body}; delete $self->{text_decoded}; delete $self->{text_rendered}; + delete $self->{text_normalized}; # Destroy the tree ... $self->SUPER::finish(); @@ -659,7 +661,7 @@ } # prepare a new tree node - my $part_msg = Mail::SpamAssassin::Message::Node->new({ subparse=>$msg->{subparse}-1 }); + my $part_msg = Mail::SpamAssassin::Message::Node->new({ subparse=>$msg->{subparse}-1, normalize_charset=>$self->{normalize_charset} }); my $in_body = 0; my $header; my $part_array; @@ -706,7 +708,7 @@ # make sure we start with a new clean node $in_body = 0; - $part_msg = Mail::SpamAssassin::Message::Node->new({ subparse=>$msg->{subparse}-1 }); + $part_msg = Mail::SpamAssassin::Message::Node->new({ subparse=>$msg->{subparse}-1, normalize_charset=>$self->{normalize_charset} }); undef $part_array; undef $header; @@ -774,6 +776,7 @@ # 0: content-type, 1: boundary, 2: charset, 3: filename my @ct = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type')); $part_msg->{'type'} = $ct[0]; + $part_msg->{'charset'} = $ct[2]; # multipart sections are required to have a boundary set ... If this # one doesn't, assume it's malformed and revert to text/plain @@ -802,6 +805,7 @@ message => $message, parsenow => 1, subparse => $msg->{subparse}-1, + normalize_charset => $self->{normalize_charset}, }); # main message is a message/* part ... @@ -922,7 +926,7 @@ my $html_needs_setting = !exists $self->{metadata}->{html}; # Go through each part - my $text = $self->get_header ('subject') || ''; + my $text = $self->get_header ('subject', undef, $self->{normalize_charset}) || ''; for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { my $p = $parts[$pt]; @@ -953,7 +957,14 @@ # whitespace handling (warning: small changes have large effects!) $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed - $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space + if ($self->{normalize_charset}) { + $text =~ s/\xc2\xa0/ /g; # no-break space => space + $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space + $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space + } + else { + $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space + } $text =~ tr/\f/\n/; # form feeds => newline my @textary = split_into_array_of_short_lines ($text); @@ -1008,7 +1019,14 @@ # whitespace handling (warning: small changes have large effects!) $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed - $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space + if ($self->{normalize_charset}) { + $text =~ s/\xc2\xa0/ /g; # no-break space => space + $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space + $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space + } + else { + $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space + } $text =~ tr/\f/\n/; # form feeds => newline my @textary = split_into_array_of_short_lines ($text); @@ -1019,6 +1037,58 @@ # --------------------------------------------------------------------------- +sub get_normalized_body_text_array { + my ($self) = @_; + + if (exists $self->{text_normalized}) { return $self->{text_normalized}; } + + $self->{text_normalized} = []; + + # Find all parts which are leaves + my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1); + return $self->{text_normalized} unless @parts; + + # the html metadata may have already been set, so let's not bother if it's + # already been done. + my $html_needs_setting = !exists $self->{metadata}->{html}; + + # Go through each part + my $text = $self->get_header ('subject', undef, 1) || ''; + my @langs; + for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { + my $p = $parts[$pt]; + + # put a blank line between parts ... + $text .= "\n" if ( $text ); + + my($type, $rnd) = $p->normalized(); # decode this part + if ( defined $rnd ) { + # Only text/* types are normalized ... + $text .= $rnd; + + if ($html_needs_setting && $type eq 'text/html') { + $self->{metadata}->{html} = $p->{html_results}; + } + } + else { + $text .= $p->decode(); + } + } + + # whitespace handling (warning: small changes have large effects!) + $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed + $text =~ s/\xc2\xa0/ /g; # no-break space => space + $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space + $text =~ tr/\f/\n/; # form feeds => newline + + my @textary = split_into_array_of_short_lines ($text); + $self->{text_normalized} = \@textary; + + return $self->{text_normalized}; +} + +# --------------------------------------------------------------------------- + sub get_decoded_body_text_array { my ($self) = @_; @@ -1044,6 +1114,27 @@ # --------------------------------------------------------------------------- +sub get_language { + my ($self) = @_; + + if (defined $self->{language}) { return $self->{language}; } + my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1); + return '' unless @parts; + + # Go through each part + my @langs; + for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { + my $p = $parts[$pt]; + my $lang = $p->{language}; + next unless ($lang); + push(@langs, $lang) unless (grep(/^$lang$/, @langs)) + } + $self->{language} = scalar(@langs) ? join(' ', @langs) : ''; + return $self->{language}; +} + +# --------------------------------------------------------------------------- + sub split_into_array_of_short_lines { my @result = (); foreach my $line (split (/^/m, $_[0])) { diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/PerMsgStatus.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/PerMsgStatus.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/PerMsgStatus.pm 2007-06-08 22:09:55.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/PerMsgStatus.pm 2007-12-28 10:00:35.000000000 +0900 @@ -85,6 +85,7 @@ 'disable_auto_learning' => 0, 'auto_learn_status' => undef, 'conf' => $main->{conf}, + 'normalize_charset' => $main->{conf}->{normalize_charset}, }; if (defined $opts && $opts->{disable_auto_learning}) { @@ -160,6 +161,7 @@ my $needs_dnsbl_harvest_p = 1; # harvest needs to be run my $decoded = $self->get_decoded_stripped_body_text_array(); + my $normalized = $self->get_normalized_body_text_array(); my $bodytext = $self->get_decoded_body_text_array(); my $fulltext = $self->{msg}->get_pristine(); @@ -199,6 +201,11 @@ $self->do_body_tests($priority, $decoded); $self->do_body_uri_tests($priority, @uris); $self->do_body_eval_tests($priority, $decoded); + + if ($self->{normalize_charset}) { + $self->do_nbody_tests($priority, $normalized); + $self->do_nbody_eval_tests($priority, $normalized); + } $self->do_rawbody_tests($priority, $bodytext); $self->do_rawbody_eval_tests($priority, $bodytext); @@ -226,6 +233,7 @@ # finished running rules delete $self->{current_rule_name}; undef $decoded; + undef $normalized; undef $bodytext; undef $fulltext; @@ -599,7 +607,13 @@ my ($self) = @_; my $str = ''; - my $ary = $self->get_decoded_stripped_body_text_array(); + my $ary; + if ($self->{'normalize_charset'}) { + $ary = $self->get_normalized_body_text_array(); + } + else { + $ary = $self->get_decoded_stripped_body_text_array(); + } shift @{$ary}; # drop the subject line my $numlines = 3; @@ -829,6 +843,17 @@ my $description = $self->{main}->{'encapsulated_content_description'}; + if ($self->{normalize_charset}) { + my $cs = 'utf8'; + if ($self->{conf}->{report_charset}) { + $cs = $self->{conf}->{report_charset}; + Encode::from_to($report, 'utf8', $cs); + } + else { + $report_charset = "; charset=UTF-8"; + } + } + # Note: the message should end in blank line since mbox format wants # blank line at end and messages may be concatenated! In addition, the # x-spam-type parameter is fixed since we will use it later to recognize @@ -962,8 +987,11 @@ return $text unless ($text =~ /[\x80-\xff]/); my $cs = 'ISO-8859-1'; - if ($self->{report_charset}) { - $cs = $self->{report_charset}; + if ($self->{conf}->{report_charset}) { + $cs = $self->{conf}->{report_charset}; + } + if ($self->{normalize_charset}) { + Encode::from_to($text, 'utf8', $cs); } my @hexchars = split('', '0123456789abcdef'); @@ -1316,6 +1344,9 @@ if (defined &{'_body_uri_tests_'.$clean_priority}) { undef &{'_body_uri_tests_'.$clean_priority}; } + if (defined &{'_nbody_tests_'.$clean_priority}) { + undef &{'_nbody_tests_'.$clean_priority}; + } if (defined &{'_rawbody_tests_'.$clean_priority}) { undef &{'_rawbody_tests_'.$clean_priority}; } @@ -1387,6 +1418,10 @@ return $_[0]->{msg}->get_rendered_body_text_array(); } +sub get_normalized_body_text_array { + return $_[0]->{msg}->get_normalized_body_text_array(); +} + ########################################################################### =item $status->get (header_name [, default_value]) @@ -1477,12 +1512,16 @@ my $getaddr = 0; my $getname = 0; my $getraw = 0; + my $getutf8 = 0; # special queries if (index($request, ':') != -1) { $getaddr = ($request =~ s/:addr$//); $getname = ($request =~ s/:name$//); $getraw = ($request =~ s/:raw$//); + if ($self->{normalize_charset}) { + $getutf8 = ($request =~ s/:utf8$//); + } } # ALL: entire raw headers @@ -1511,26 +1550,26 @@ } # ToCc: the combined recipients list elsif ($request eq 'ToCc') { - $result = join("\n", $self->{msg}->get_header('To', $getraw)); + $result = join("\n", $self->{msg}->get_header('To', $getraw, $getutf8)); if ($result) { chomp $result; $result .= ", " if $result =~ /\S/; } - $result .= join("\n", $self->{msg}->get_header('Cc', $getraw)); + $result .= join("\n", $self->{msg}->get_header('Cc', $getraw, $getutf8)); $result = undef if !$result; } # MESSAGEID: handle lists which move the real message-id to another # header for resending. elsif ($request eq 'MESSAGEID') { $result = join("\n", grep { defined($_) && length($_) > 0 } - $self->{msg}->get_header('X-Message-Id', $getraw), - $self->{msg}->get_header('Resent-Message-Id', $getraw), - $self->{msg}->get_header('X-Original-Message-ID', $getraw), - $self->{msg}->get_header('Message-Id', $getraw)); + $self->{msg}->get_header('X-Message-Id', $getraw, $getutf8), + $self->{msg}->get_header('Resent-Message-Id', $getraw, $getutf8), + $self->{msg}->get_header('X-Original-Message-ID', $getraw, $getutf8), + $self->{msg}->get_header('Message-Id', $getraw, $getutf8)); } # a conventional header else { - $result = join('', $self->{msg}->get_header($request, $getraw)); + $result = join('', $self->{msg}->get_header($request, $getraw, $getutf8)); # metadata if (!$result) { @@ -1831,6 +1870,96 @@ } } +sub do_nbody_tests { + my ($self, $priority, $textary) = @_; + local ($_); + + dbg("rules: running nbody-text per-line regexp tests; score so far=".$self->{score}); + + my $doing_user_rules = + $self->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_NBODY_TESTS}; + + # clean up priority value so it can be used in a subroutine name + my $clean_priority; + ($clean_priority = $priority) =~ s/-/neg/; + + $self->{test_log_msgs} = (); # clear test state + if (defined &{'Mail::SpamAssassin::PerMsgStatus::_nbody_tests_'.$clean_priority} + && !$doing_user_rules) { + no strict "refs"; + &{'Mail::SpamAssassin::PerMsgStatus::_nbody_tests_'.$clean_priority}($self, @$textary); + use strict "refs"; + return; + } + + # build up the eval string... + my $evalstr = ''; + my $evalstr2 = ''; + + while (my($rulename, $pat) = each %{$self->{conf}{nbody_tests}->{$priority}}) { + $evalstr .= ' + if ($self->{conf}->{scores}->{q{'.$rulename.'}}) { + # call procedurally as it is faster. + '.$rulename.'_nbody_test($self,@_); + } + '; + + if ($doing_user_rules) { + next if (!$self->is_user_rule_sub ($rulename.'_nbody_test')); + } + + $evalstr2 .= ' + sub '.$rulename.'_nbody_test { + my $self = shift; + foreach (@_) { + '.$self->hash_line_for_rule($rulename).' + if ('.$pat.') { + $self->got_pattern_hit(q{'.$rulename.'}, "NBODY: "); + '. $self->ran_rule_debug_code($rulename, "nbody", 2) . ' + # Ok, we hit, stop now. + last; + } + } + } + '; + } + + # clear out a previous version of this fn, if already defined + if (defined &{'_nbody_tests_'.$clean_priority}) { + undef &{'_nbody_tests_'.$clean_priority}; + } + + return unless ($evalstr); + + # generate the loop that goes through each line... + $evalstr = <<"EOT"; +{ + package Mail::SpamAssassin::PerMsgStatus; + + $evalstr2; + + sub _nbody_tests_$clean_priority { + my \$self = shift; + $evalstr; + } + + 1; +} +EOT + + # and run it. + eval $evalstr; + if ($@) { + warn("rules: failed to compile nbody tests, skipping:\n" . "\t($@)\n"); + $self->{rule_errors}++; + } + else { + no strict "refs"; + &{'Mail::SpamAssassin::PerMsgStatus::_nbody_tests_'.$clean_priority}($self, @$textary); + use strict "refs"; + } +} + sub is_user_rule_sub { my ($self, $subname) = @_; return 0 if (eval 'defined &Mail::SpamAssassin::PerMsgStatus::'.$subname); @@ -2424,6 +2553,12 @@ $self->run_eval_tests ($self->{conf}->{body_evals}->{$priority}, 'BODY: ', $bodystring); } +sub do_nbody_eval_tests { + my ($self, $priority, $bodystring) = @_; + return unless (defined($self->{conf}->{nbody_evals}->{$priority})); + $self->run_eval_tests ($self->{conf}->{nbody_evals}->{$priority}, 'NBODY: ', $bodystring); +} + sub do_rawbody_eval_tests { my ($self, $priority, $bodystring) = @_; return unless (defined($self->{conf}->{rawbody_evals}->{$priority})); diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm 2006-09-29 22:06:39.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm 2007-12-28 10:00:35.000000000 +0900 @@ -82,7 +82,7 @@ my $end = $opts->{conf}->{replace_end}; # this is the version-specific code - for my $type (qw|body_tests rawbody_tests head_tests full_tests uri_tests|) { + for my $type (qw|body_tests nbody_tests rawbody_tests head_tests full_tests uri_tests|) { for my $priority (keys %{$opts->{conf}->{$type}}) { while (my ($rule, $re) = each %{$opts->{conf}->{$type}->{$priority}}) { # skip if not listed by replace_rules diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 1970-01-01 09:00:00.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2007-12-28 10:00:35.000000000 +0900 @@ -0,0 +1,81 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Tokenizer::MeCab - Japanese tokenizer with MeCab + +=head1 SYNOPSIS + +loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab + +=head1 DESCRIPTION + +This plugin tokenizes a Japanese string with MeCab that is +the morphological analysis engine. + +Text::MeCab 0.12 or over is required. + +=cut + +package Mail::SpamAssassin::Plugin::Tokenizer::MeCab; + +use Mail::SpamAssassin::Plugin::Tokenizer; +use strict; +use warnings; +use bytes; + +use vars qw(@ISA); +@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); + +# Have to do this so that RPM doesn't find these as required perl modules +BEGIN { require Text::MeCab; } +our $language = 'ja'; + +sub new { + my $class = shift; + my $mailsaobject = shift; + + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject, $language); + bless ($self, $class); + + return $self; +} + +sub tokenize { + my $self = shift; + my $text_array = shift; + + my $mecab = Text::MeCab->new(); + my @tokenized_array; + foreach my $text (@$text_array) { + unless ($text and $text =~ /[\x80-\xFF]/) { + push(@tokenized_array, $text); + next; + } + + my @buf; + for (my $node = $mecab->parse($text); $node; $node = $node->next) { + push(@buf, $node->surface); + } + my $tokenized = join(' ', @buf); + push(@tokenized_array, $tokenized); + } + return \@tokenized_array; +} + +1; diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 1970-01-01 09:00:00.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2007-12-28 10:00:35.000000000 +0900 @@ -0,0 +1,99 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Tokenizer::SimpleJA - simple Japanese tokenizer + +=head1 SYNOPSIS + +loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA + +=head1 DESCRIPTION + +This plugin simply tokenizes a Japanese string by characters other than +the alphabet, the Chinese character, and the katakana. + +=cut + +package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA; + +use Mail::SpamAssassin::Plugin::Tokenizer; +use strict; +use warnings; +use bytes; + +use vars qw(@ISA); +@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); + +our $language = 'ja'; + +our $RE = qr{( + # ASCII + [\x00-\x7F]+ + # Katakana + | (?: + \xE3\x82[\xA0-\xBF] + | \xE3\x83[\x80-\xBF] + )+ + # Kanji + | (?: + \xE3[\x90-\xBF][\x80-\xBF] + | [\xE4-\xE9][\x80-\xBF]{2} + | \xEF[\xA4-\xAB][\x80-\xBF] + )+ + # Others + | [\xC0-\xDF][\x80-\xBF] + | [\xE0-\xE2][\x80-\xBF]{2} + | \xE3[\x80-\x81][\x80-\xBF] + | \xE3\x82[\x80-\x9F] + | \xE3[\x84-\x8F][\x80-\xBF] + | [\xEA-\xEE][\x80-\xBF]{2} + | \xEF[\x80-\xA3][\x80-\xBF] + | \xEF[\xAC-\xBF][\x80-\xBF] + | [\xF0-\xF7][\x80-\xBF]{3} +)}x; + +sub new { + my $class = shift; + my $mailsaobject = shift; + + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject, $language); + bless ($self, $class); + + return $self; +} + +sub tokenize { + my $self = shift; + my $text_array = shift; + + my @tokenized_array; + foreach my $text (@$text_array) { + unless ($text and $text =~ /[\x80-\xFF]/) { + push(@tokenized_array, $text); + next; + } + + my $tokenized = $text; + $tokenized =~ s/$RE/$1 /og; + push(@tokenized_array, $tokenized); + } + return \@tokenized_array; +} + +1; diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/Tokenizer.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/Tokenizer.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 1970-01-01 09:00:00.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2007-12-28 10:00:35.000000000 +0900 @@ -0,0 +1,114 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class + +=head1 SYNOPSIS + +=head2 SpamAssassin configuration: + + loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm + +=head2 Perl code: + + use Mail::SpamAssassin::Plugin::Tokenizer; + use vars qw(@ISA); + @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); + # language to use this plugin + our $language = 'ja'; + + # constructor: register language + sub new { + my $class = shift; + my $mailsaobject = shift; + + # some boilerplate... + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject, $language); + bless ($self, $class); + + return $self; + } + + # tokenize function + sub tokenize { + my $self = shift; + my $text_array_ref = shift; + + ...... + + return $tokenized_array_ref; + } + + +=head1 DESCRIPTION + +This plugin is the base class of tokenizer plugin. +You must define tokenize() and $language + +=head1 INTERFACE + + sub tokenize { + my $self = shift; + my $text_array_ref = shift; + + ...... + + return $tokenized_array_ref; + } + +=cut + +package Mail::SpamAssassin::Plugin::Tokenizer; + +use Mail::SpamAssassin::Plugin; +use Mail::SpamAssassin::Logger; +use strict; +use warnings; +use bytes; + +use vars qw(@ISA); +@ISA = qw(Mail::SpamAssassin::Plugin); + +sub new { + my $class = shift; + my $mailsaobject = shift; + my $language = shift; + + # some boilerplate... + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject); + bless ($self, $class); + + if ($language) { + $self->{main}->{conf}->{tokenizer}->{$language} = $self; + } + else { + dbg("plugin: $self: \$language is not defined"); + } + + return $self; +} + +sub tokenize { + my ($self, $ref) = @_; + + return $ref; +} + +1; diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Util/Charset.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Util/Charset.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Util/Charset.pm 1970-01-01 09:00:00.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Util/Charset.pm 2007-12-28 10:00:35.000000000 +0900 @@ -0,0 +1,423 @@ +# <@LICENSE> +# Copyright 2006 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +=head1 NAME + + Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language + +=head1 SYNOPSIS + + my ($detected, $decoded) = Mail::SpamAssassin::Util::Charset::normalize_charset($charset, $str); + my $language = Mail::SpamAssassin::Util::Charset::get_language($charset, $str); + +=head1 DESCRIPTION + +This module implements utility methods for charset and language. + +=cut + +package Mail::SpamAssassin::Util::Charset; + +use strict; +use warnings; + +use vars qw ( + @ISA @EXPORT +); + +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(normalize_charset get_language); + +########################################################################### + +use constant HAS_ENCODE => eval { require Encode; require Encode::Guess; require Encode::Alias; }; +use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; +use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; + +########################################################################### + +our $KANA_HAN_RE = qr{ + # Hiragana and Katakana + \xE3[\x81-\x83][\x80-\xBF] + # Han + | \xE3[\x90-\xBF][\x80-\xBF] + | [\xE4-\xE9][\x80-\xBF]{2} + | \xEF[\xA4-\xAB][\x80-\xBF] +}x; + +our %enc2lang; +our %lang2enc; +our %scr2lang; +our %cjkscr2lang; +our @scrorder; + +BEGIN { + + # See the following URL about this map: + # http://czyborra.com/charsets/iso8859.html + # http://czyborra.com/charsets/codepages.html + # http://czyborra.com/charsets/cyrillic.html + # http://en.wikipedia.org/wiki/ISO_8859 + # http://www.w3.org/International/O-charset-lang.html + %enc2lang = ( + # buint-in Encodings and Encode::Byte + # N. America + 'ascii' => 'en', + 'cp437' => 'en', + 'cp863' => 'weurope', + + # W. Europe (Latin1, Latin9) + # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af + 'iso-8859-1' => 'weurope', + 'iso-8859-15' => 'weurope', + 'cp850' => 'weurope', + 'cp860' => 'weurope', + 'cp1252' => 'weurope', + 'MacRoman' => 'weurope', + + # Cntrl. Europe / Latin2 / Latin10 + # hr cs hu pl sr sk sl + 'iso-8859-2' => 'ceurope', + 'cp852' => 'ceurope', + 'cp1250' => 'ceurope', + 'MacCentralEurRoman' => 'ceurope', + 'MacCroatian' => 'ceurope', + 'iso-8859-16' => 'ceurope', + 'MacRomanian' => 'ceurope', + + # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) + # eo mt + 'iso-8859-3' => 'seurope', + + # Baltics (Latin4, Latin7) + # lv lt + 'iso-8859-4' => 'neurope', + 'iso-8859-13' => 'baltic', + 'cp1257' => 'baltic', + + # Nordics (Latin6) + # et kl iu se + 'iso-8859-10' => 'nordic', + + # Cyrillics + # bg be uk sr mk ru + 'iso-8859-5' => 'ru', + 'cp855' => 'ru', + 'cp1251' => 'ru', + 'cp866' => 'ru', + 'MacCyrillic' => 'ru', + 'koi8-r' => 'ru', + 'MacUkrainian' => 'uk', + 'koi8-u' => 'uk', + + # Arabic + 'iso-8859-6' => 'ar', + 'cp864' => 'ar', + 'cp1256' => 'ar', + 'MacArabic' => 'ar', + 'cp1006' => 'fa', + 'MacFarsi' => 'fa', + + # Greek + 'iso-8859-7' => 'el', + 'cp1253' => 'el', + 'MacGreek' => 'el', + + # Hebrew + # he yi + 'iso-8859-8' => 'he', + 'cp862' => 'he', + 'cp1255' => 'he', + 'MacHebrew' => 'he', + + # Turkish + 'iso-8859-9' => 'tr', + 'cp857' => 'tr', + 'cp1254' => 'tr', + 'MacTurkish' => 'tr', + + # Thai + 'iso-8859-11' => 'th', + 'cp874' => 'th', + + # Celtics (Latin8) + # gd cy br + 'iso-8859-14' => 'celtic', + + # Vietnamese + 'viscii' => 'vi', + 'cp1258' => 'vi', + + # Encode::CN + 'euc-cn' => 'zh', + 'cp936' => 'zh', + 'hz' => 'zh', + + # Encode::TW + 'big5-eten' => 'zh', + 'big5-hkscs' => 'zh', + 'cp950' => 'zh', + + # Encode::JP + 'euc-jp' => 'ja', + 'shiftjis' => 'ja', + '7bit-jis' => 'ja', + 'iso-2022-jp' => 'ja', + 'iso-2022-jp-1' => 'ja', + 'cp932' => 'ja', + + # Encode::KR + 'euc-kr' => 'ko', + 'cp949' => 'ko', + 'johab' => 'ko', + 'iso-2022-kr' => 'ko', + + # Encode::HanExtra + 'euc-tw' => 'zh', + 'gb18030' => 'zh', + + # Encode::JIS2K + 'euc-jisx0213' => 'ja', + 'shiftjisx0123' => 'ja', + 'iso-2022-jp-3' => 'ja', + + ); + + %lang2enc = ( + # Latin1 + 'en' => ['ascii'], + 'weurope' => ['cp1252'], + + # Latin2 + 'ceurope' => ['cp1250'], + + # Latin3 + 'seurope' => ['iso-8859-3'], + + # Latin4 + 'neurope' => ['iso-8859-4'], + + # Latin5 + 'tr' => ['cp1254'], + + # Latin6 + 'nordic' => ['iso-8859-10'], + + # Latin7 + 'baltic' => ['cp1257'], + + # Latin8 + 'celtic' => ['iso-8859-14'], + + # Non Latin + 'ru' => ['koi8-r', 'cp1251'], + 'uk' => ['koi8-u'], + + 'ar' => ['cp1256'], + 'el' => ['cp1253'], + 'he' => ['cp1255'], + 'th' => ['cp874'], + 'vi' => ['viscii', 'cp1258'], + 'zh' => ['euc-cn', 'cp950'], + 'ja' => ['euc-jp', 'cp932'], + 'ko' => ['euc-kr', 'cp949'], + + ); + + %scr2lang = ( + 'InLatin1Supplement' => ['weurope'], + 'InLatinExtendedA' => [ + 'ceurope', + 'seurope', + 'tr', + 'vi' + ], + 'InLatinExtendedB' => [ + 'nordic', + 'baltic', + 'celtic' + ], + 'Thai' => ['th'], + 'Cyrillic' => ['ru', 'uk'], + 'Arabic' => ['ar'], + 'Greek' => ['el'], + 'Hebrew' => ['he'], + ); + + # better detection for CJK + @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); + %cjkscr2lang = ( + 'Hiragana' => ['ja'], + 'Katakana' => ['ja'], + 'Hangul' => ['ko'], + 'Han' => ['zh', 'ja', 'ko'], + ); + + if (HAS_ENCODE) { + unless (HAS_ENCODE_HANEXTRA) { + Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); + } + Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); + Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); + Encode::Alias::define_alias( qr/^Shift_JIS$/ => ' "cp932"' ); + } +} + +sub get_language { + my $charset = shift; + my $str = shift; # $str must be UTF-8 encoding + + return 'en' unless HAS_ENCODE; + return 'en' unless $charset; + if ($charset !~ /^utf/i) { + return $enc2lang{$charset}; + } elsif (defined($str)) { + $str =~ s/[\x00-\x7F]//g; # remove ASCII characters + return 'en' if ($str eq ''); + + my %handled; + $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); + foreach my $scr (@scrorder) { + next if ($str !~ /\p{$scr}/); + my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; + foreach my $lang (@$scrlangs) { + next if (exists($handled{$lang})); + foreach my $enc (@{$lang2enc{$lang}}) { + my $scratch = $str; + Encode::encode($enc, $scratch, Encode::FB_QUIET); + return $lang if ($scratch eq ''); + } + $handled{$lang} = 1; + } + } + } + return 'en'; +} + + +# TEST 1: try conversion to use the specified charset. +# TEST 2: try conversion to use Encode::Detect. +# TEST 3: try conversion to use Encode::Guess. +sub normalize_charset { + my $charset = shift; + my $str = shift; + + return ($charset, $str) unless HAS_ENCODE; + return ('ascii', $str) unless ($str); + + my $decoded; + my $detected; + + if ($charset) { + ($detected, $decoded) = _specified_encoding($charset, $str); + } + unless ($detected) { + ($detected, $decoded) = _encode_detect($str); + } + unless ($detected) { + ($detected, $decoded) = _encode_guess($charset, $str); + } + unless ($detected) { + return (undef, $str); + } + $decoded = Encode::encode_utf8($decoded); + + # unfold hiragana, katakana and han + if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949$)/i) { + $decoded =~ s/($KANA_HAN_RE)\r?\n($KANA_HAN_RE)/$1$2/og; + } + return ($detected, $decoded); +} + +sub _specified_encoding { + my $encoding = shift; + my $str = shift; + + my $detected; + my $decoded; + # note: ISO-2022-* is not deistinguish from US-ASCII + if ($encoding and ($str !~ /\e/ or $encoding =~ /^ISO-2022/i)) { + my $encoder = Encode::find_encoding($encoding); + if (ref($encoder)) { + $decoded = $encoder->decode($str,Encode::FB_QUIET); + $detected = $encoder->name if ($str eq ''); + } + } + return ($detected, $decoded); +} + +sub _encode_detect { + return undef unless HAS_ENCODE_DETECT; + my $str = shift; + + my $decoded; + my $detected = Encode::Detect::Detector::detect($str); + if ($detected) { + my $encoder = Encode::find_encoding($detected); + if (ref($encoder)) { + $decoded = $encoder->decode($str); + $detected = $decoded ? $encoder->name : undef; + } + else { + $detected = undef; + } + } + return ($detected, $decoded); +} + +sub _encode_guess { + my $encoding = shift; + my $str = shift; + + my $detected; + my $decoded; + my $encoder; + if ($encoding) { + $encoding = Encode::resolve_alias($encoding); + if ($encoding) { + $encoder = Encode::Guess::guess_encoding($str, $encoding); + } + } + unless (ref($encoder)) { + if ($str =~ /\e/) { + $encoder = Encode::Guess::guess_encoding($str, qw/7bit-jis iso-2022-kr/); + } + elsif ($str =~ /[\x80-\xFF]{4}/) { + $encoder = Encode::Guess::guess_encoding($str, + qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); + } + else { + $encoder = Encode::Guess::guess_encoding($str, qw/iso-8859-1 cp1252/); + } + } + if (ref($encoder)) { + $detected = $encoder->name; + if ($detected =~ /^UTF-(?:16|32)[BL]E$/i) { + # The guessed UTF-16|32 encoding without BOM cannot be trusted. + $detected = undef; + } + if ($detected) { + $decoded = $encoder->decode($str); + } + } + return ($detected, $decoded); +} + +1; + diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Util/DependencyInfo.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Util/DependencyInfo.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin/Util/DependencyInfo.pm 2006-09-29 22:06:38.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin/Util/DependencyInfo.pm 2007-12-28 10:00:35.000000000 +0900 @@ -169,6 +169,12 @@ desc => 'The "sa-update" script requires this module to access compressed update archive files.', }, +{ + module => 'Encode::Detect', + version => '0.00', + desc => 'If this module is installed, SpamAssassin will detect charsets + and convert them into Unicode.', +}, ); ########################################################################### diff -uNr Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin.pm Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin.pm --- Mail-SpamAssassin-3.1.9.orig/lib/Mail/SpamAssassin.pm 2007-06-08 22:27:53.000000000 +0900 +++ Mail-SpamAssassin-3.1.9/lib/Mail/SpamAssassin.pm 2007-12-28 10:00:35.000000000 +0900 @@ -401,7 +401,9 @@ sub parse { my($self, $message, $parsenow) = @_; - my $msg = Mail::SpamAssassin::Message->new({message=>$message, parsenow=>$parsenow}); + $self->init(1); + my $normalize_charset = $self->{'conf'}->{'normalize_charset'}; + my $msg = Mail::SpamAssassin::Message->new({message=>$message, parsenow=>$parsenow, normalize_charset=>$normalize_charset}); return $msg; }