Charset-normalization patch for Mail-SpamAssassin 3.2.3.

What the hunks below do:

  * HTML.pm: new() takes an optional second argument, a hash reference;
    its 'normalize' entry is stored on the parser object.  When it is
    true, the whitespace-collapsing step first rewrites the byte
    sequences \xc2\xa0 (UTF-8 no-break space) and \xe3\x80\x80 (UTF-8
    ideographic space) to a plain space, and no longer treats a lone
    \xa0 byte as whitespace.
  * Message/Node.pm: _normalize() hands the work to normalize_charset()
    and, in list context, returns (data, charset) instead of just the
    data.  rendered() stores the detected charset and the result of
    get_language() on the part when normalization is enabled, and passes
    the normalize flag to the HTML parser.
    NOTE(review): normalize_charset() and get_language() are presumably
    exported by Mail::SpamAssassin::Util::Charset; that module is not
    created by any hunk visible here -- confirm it is installed.
  * Message.pm: the cleanup code also drops the 'charset' and 'language'
    keys; the three whitespace-handling sites get the same
    normalize-aware treatment as HTML.pm; a new get_language() method
    returns the distinct per-part languages joined by a single space
    (cached in $self->{language}).

Corrections relative to the patch as originally received:

  1. In the per-part cleanup hunk (@@ -553 @@) the two added deletes
     operated on $self although every neighbouring line operates on
     $part, so child parts kept their charset/language entries.  They
     now operate on $part.
  2. get_language() de-duplicated languages with grep(/^$lang$/, ...),
     interpolating a data value into a regex.  It now compares with
     'eq', and the statement is properly terminated.

The patch had lost its line breaks; they have been restored from the
hunk headers.  Leading whitespace in context lines (tabs vs. spaces)
could not be recovered exactly, so apply with "patch -p1 -l" if a hunk
is rejected on whitespace.

diff -uNr Mail-SpamAssassin-3.2.3.orig/lib/Mail/SpamAssassin/HTML.pm Mail-SpamAssassin-3.2.3/lib/Mail/SpamAssassin/HTML.pm
--- Mail-SpamAssassin-3.2.3.orig/lib/Mail/SpamAssassin/HTML.pm	2007-08-08 22:19:15.000000000 +0900
+++ Mail-SpamAssassin-3.2.3/lib/Mail/SpamAssassin/HTML.pm	2007-12-27 15:59:40.000000000 +0900
@@ -86,7 +86,9 @@
 $ok_attributes{span}{$_} = 1 for qw( style );
 
 sub new {
-  my ($class) = @_;
+  my $class = shift;
+  my $opts = shift;
+
   my $self = $class->SUPER::new(
                 api_version => 3,
                 handlers => [
@@ -99,6 +101,7 @@
                   declaration => ["html_declaration", "self,text"],
                 ],
                 marked_sections => 1);
+  $self->{normalize} = $opts->{normalize} || 0;
 
   $self;
 }
@@ -680,7 +683,14 @@
     }
   }
   else {
-    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
+    if ($self->{normalize}) {
+      $text =~ s/\xc2\xa0/ /g;             # no-break space
+      $text =~ s/\xe3\x80\x80/ /g;         # ideographicspace
+      $text =~ s/[ \t\n\r\f\x0b]+/ /g;
+    }
+    else {
+      $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
+    }
     # trim leading whitespace if previous element was whitespace
     if (@{ $self->{text} } &&
         defined $self->{text_whitespace} &&
diff -uNr Mail-SpamAssassin-3.2.3.orig/lib/Mail/SpamAssassin/Message/Node.pm Mail-SpamAssassin-3.2.3/lib/Mail/SpamAssassin/Message/Node.pm
--- Mail-SpamAssassin-3.2.3.orig/lib/Mail/SpamAssassin/Message/Node.pm	2007-08-08 22:19:11.000000000 +0900
+++ Mail-SpamAssassin-3.2.3/lib/Mail/SpamAssassin/Message/Node.pm	2007-12-27 15:59:40.000000000 +0900
@@ -41,6 +41,7 @@
 use Mail::SpamAssassin::Constants qw(:sa);
 use Mail::SpamAssassin::HTML;
 use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Util::Charset;
 
 =item new()
 
@@ -378,27 +379,10 @@
 
 sub _normalize {
   my ($self, $data, $charset) = @_;
-  return $data unless $self->{normalize};
+  return wantarray ? ($data, $charset) : $data unless $self->{normalize};
 
-  my $detected = Encode::Detect::Detector::detect($data);
-
-  my $converter;
-
-  if ($charset && $charset !~ /^us-ascii$/i &&
-      ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) {
-    dbg("Using labeled charset $charset");
-    $converter = Encode::find_encoding($charset);
-  }
-
-  $converter = Encode::find_encoding($detected) unless $converter || !defined($detected);
-
-  return $data unless $converter;
-
-  dbg("Converting...");
-
-  my $rv = $converter->decode($data, 0);
-  utf8::downgrade($rv, 1);
-  return $rv
+  my ($decoded_data, $detected_charset) = normalize_charset($data, $charset);
+  return wantarray ? ($decoded_data, $detected_charset) : $decoded_data;
 }
 
 =item rendered()
@@ -421,8 +405,12 @@
   # text/x-aol is ignored here, but looks like text/html ...
   return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
 
-  my $text = $self->_normalize($self->decode(), $self->{charset});
+  my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset});
   my $raw = length($text);
+  if ($self->{normalize}) {
+    $self->{charset} = $charset;
+    $self->{language} = get_language($text, $charset);
+  }
 
   # render text/html always, or any other text|text/plain part as text/html
   # based on a heuristic which simulates a certain common mail client
@@ -432,7 +420,7 @@
   {
     $self->{rendered_type} = 'text/html';
 
-    my $html = Mail::SpamAssassin::HTML->new();        # object
+    my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}});        # object
     $html->parse($text);                                # parse+render text
     $self->{rendered} = $html->get_rendered_text();
     $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
diff -uNr Mail-SpamAssassin-3.2.3.orig/lib/Mail/SpamAssassin/Message.pm Mail-SpamAssassin-3.2.3/lib/Mail/SpamAssassin/Message.pm
--- Mail-SpamAssassin-3.2.3.orig/lib/Mail/SpamAssassin/Message.pm	2007-08-08 22:19:15.000000000 +0900
+++ Mail-SpamAssassin-3.2.3/lib/Mail/SpamAssassin/Message.pm	2007-12-27 15:59:40.000000000 +0900
@@ -538,6 +538,8 @@
   delete $self->{'pristine_headers'};
   delete $self->{'line_ending'};
   delete $self->{'missing_head_body_separator'};
+  delete $self->{'charset'};
+  delete $self->{'language'};
 
   my @toclean = ( $self );
 
@@ -553,6 +555,8 @@
     delete $part->{'invisible_rendered'};
     delete $part->{'type'};
     delete $part->{'rendered_type'};
+    delete $part->{'charset'};
+    delete $part->{'language'};
 
     # if there are children nodes, add them to the queue of nodes to clean up
     if (exists $part->{'body_parts'}) {
@@ -933,7 +937,14 @@
 
   # whitespace handling (warning: small changes have large effects!)
   $text =~ s/\n+\s*\n+/\f/gs;          # double newlines => form feed
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;     # whitespace => space
+  if ($self->{normalize}) {
+    $text =~ s/\xc2\xa0/ /g;           # no-break space => space
+    $text =~ s/\xe3\x80\x80/ /g;       # ideographicspace => space
+    $text =~ tr/ \t\n\r\x0b/ /s;       # whitespace => space
+  }
+  else {
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;   # whitespace => space
+  }
   $text =~ tr/\f/\n/;                  # form feeds => newline
 
   # warn "message: $text";
@@ -990,7 +1001,14 @@
 
   # whitespace handling (warning: small changes have large effects!)
   $text =~ s/\n+\s*\n+/\f/gs;          # double newlines => form feed
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;     # whitespace => space
+  if ($self->{normalize}) {
+    $text =~ s/\xc2\xa0/ /g;           # no-break space => space
+    $text =~ s/\xe3\x80\x80/ /g;       # ideographicspace => space
+    $text =~ tr/ \t\n\r\x0b/ /s;       # whitespace => space
+  }
+  else {
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;   # whitespace => space
+  }
   $text =~ tr/\f/\n/;                  # form feeds => newline
 
   my @textary = split_into_array_of_short_lines ($text);
@@ -1041,7 +1059,14 @@
 
   # whitespace handling (warning: small changes have large effects!)
   $text =~ s/\n+\s*\n+/\f/gs;          # double newlines => form feed
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;     # whitespace => space
+  if ($self->{normalize}) {
+    $text =~ s/\xc2\xa0/ /g;           # no-break space => space
+    $text =~ s/\xe3\x80\x80/ /g;       # ideographicspace => space
+    $text =~ tr/ \t\n\r\x0b/ /s;       # whitespace => space
+  }
+  else {
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;   # whitespace => space
+  }
   $text =~ tr/\f/\n/;                  # form feeds => newline
 
   my @textary = split_into_array_of_short_lines ($text);
@@ -1094,6 +1119,28 @@
 
 # ---------------------------------------------------------------------------
 
+sub get_language {
+  my ($self) = @_;
+
+  if (defined $self->{language}) { return $self->{language}; }
+  my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
+  return '' unless @parts;
+
+  # Go through each part
+  my @langs;
+  for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
+    my $p = $parts[$pt];
+    my $lang = $p->{language};
+    next unless ($lang);
+    push(@langs, $lang) unless (grep { $_ eq $lang } @langs);
+  }
+  $self->{language} = scalar(@langs) ? join(' ', @langs) : '';
+  return $self->{language};
+}
+
+# ---------------------------------------------------------------------------
+
+
 1;
 
 =back