diff -uNr Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/Util/Charset.pm Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/Util/Charset.pm --- Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/Util/Charset.pm 1970-01-01 09:00:00.000000000 +0900 +++ Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/Util/Charset.pm 2008-01-21 12:30:40.000000000 +0900 @@ -0,0 +1,432 @@ +# <@LICENSE> +# Copyright 2006 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +=head1 NAME + + Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language + +=head1 SYNOPSIS + + my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset); + my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset); + +=head1 DESCRIPTION + +This module implements utility methods for charset and language. + +=cut + +package Mail::SpamAssassin::Util::Charset; + +use strict; +use warnings; + +use vars qw ( + @ISA @EXPORT +); + +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(normalize_charset get_language); + +########################################################################### + +use constant HAS_ENCODE => eval { require Encode; require Encode::Guess; require Encode::Alias; }; +use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; +use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; +use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; }; + +########################################################################### + +our $KANA_HAN_RE = qr{ + # Hiragana and Katakana + \xE3[\x81-\x83][\x80-\xBF] + # Han + | \xE3[\x90-\xBF][\x80-\xBF] + | [\xE4-\xE9][\x80-\xBF]{2} + | \xEF[\xA4-\xAB][\x80-\xBF] +}x; + +our %enc2lang; +our %lang2enc; +our %scr2lang; +our %cjkscr2lang; +our @scrorder; + +BEGIN { + + # See the following URL about this map: + # http://czyborra.com/charsets/iso8859.html + # http://czyborra.com/charsets/codepages.html + # http://czyborra.com/charsets/cyrillic.html + # http://en.wikipedia.org/wiki/ISO_8859 + # http://www.w3.org/International/O-charset-lang.html + %enc2lang = ( + # buint-in Encodings and Encode::Byte + # N. America + 'ascii' => 'en', + 'cp437' => 'en', + 'cp863' => 'weurope', + + # W. Europe (Latin1, Latin9) + # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af + 'iso-8859-1' => 'weurope', + 'iso-8859-15' => 'weurope', + 'cp850' => 'weurope', + 'cp860' => 'weurope', + 'cp1252' => 'weurope', + 'MacRoman' => 'weurope', + + # Cntrl. Europe / Latin2 / Latin10 + # hr cs hu pl sr sk sl + 'iso-8859-2' => 'ceurope', + 'cp852' => 'ceurope', + 'cp1250' => 'ceurope', + 'MacCentralEurRoman' => 'ceurope', + 'MacCroatian' => 'ceurope', + 'iso-8859-16' => 'ceurope', + 'MacRomanian' => 'ceurope', + + # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) + # eo mt + 'iso-8859-3' => 'seurope', + + # Baltics (Latin4, Latin7) + # lv lt + 'iso-8859-4' => 'neurope', + 'iso-8859-13' => 'baltic', + 'cp1257' => 'baltic', + + # Nordics (Latin6) + # et kl iu se + 'iso-8859-10' => 'nordic', + + # Cyrillics + # bg be uk sr mk ru + 'iso-8859-5' => 'ru', + 'cp855' => 'ru', + 'cp1251' => 'ru', + 'cp866' => 'ru', + 'MacCyrillic' => 'ru', + 'koi8-r' => 'ru', + 'MacUkrainian' => 'uk', + 'koi8-u' => 'uk', + + # Arabic + 'iso-8859-6' => 'ar', + 'cp864' => 'ar', + 'cp1256' => 'ar', + 'MacArabic' => 'ar', + 'cp1006' => 'fa', + 'MacFarsi' => 'fa', + + # Greek + 'iso-8859-7' => 'el', + 'cp1253' => 'el', + 'MacGreek' => 'el', + + # Hebrew + # he yi + 'iso-8859-8' => 'he', + 'cp862' => 'he', + 'cp1255' => 'he', + 'MacHebrew' => 'he', + + # Turkish + 'iso-8859-9' => 'tr', + 'cp857' => 'tr', + 'cp1254' => 'tr', + 'MacTurkish' => 'tr', + + # Thai + 'iso-8859-11' => 'th', + 'cp874' => 'th', + + # Celtics (Latin8) + # gd cy br + 'iso-8859-14' => 'celtic', + + # Vietnamese + 'viscii' => 'vi', + 'cp1258' => 'vi', + + # Encode::CN + 'euc-cn' => 'zh', + 'cp936' => 'zh', + 'hz' => 'zh', + + # Encode::TW + 'big5-eten' => 'zh', + 'big5-hkscs' => 'zh', + 'cp950' => 'zh', + + # Encode::JP + 'euc-jp' => 'ja', + 'shiftjis' => 'ja', + '7bit-jis' => 'ja', + 'iso-2022-jp' => 'ja', + 'iso-2022-jp-1' => 'ja', + 'cp932' => 'ja', + + # Encode::KR + 'euc-kr' => 'ko', + 'cp949' => 'ko', + 'johab' => 'ko', + 'iso-2022-kr' => 'ko', + + # Encode::HanExtra + 'euc-tw' => 'zh', + 'gb18030' => 'zh', + + # Encode::JIS2K + 'euc-jisx0213' => 'ja', + 'shiftjisx0123' => 'ja', + 'iso-2022-jp-3' => 'ja', + + # Encode::EUCJPMS + 'eucJP-ms' => 'ja', + 'cp51932' => 'ja', + 'cp50220' => 'ja', + 'cp50221' => 'ja', + + ); + + %lang2enc = ( + # Latin1 + 'en' => ['ascii'], + 'weurope' => ['cp1252'], + + # Latin2 + 'ceurope' => ['cp1250'], + + # Latin3 + 'seurope' => ['iso-8859-3'], + + # Latin4 + 'neurope' => ['iso-8859-4'], + + # Latin5 + 'tr' => ['cp1254'], + + # Latin6 + 'nordic' => ['iso-8859-10'], + + # Latin7 + 'baltic' => ['cp1257'], + + # Latin8 + 'celtic' => ['iso-8859-14'], + + # Non Latin + 'ru' => ['koi8-r', 'cp1251'], + 'uk' => ['koi8-u'], + + 'ar' => ['cp1256'], + 'el' => ['cp1253'], + 'he' => ['cp1255'], + 'th' => ['cp874'], + 'vi' => ['viscii', 'cp1258'], + 'zh' => ['euc-cn', 'cp950'], + 'ja' => ['euc-jp', 'cp932'], + 'ko' => ['euc-kr', 'cp949'], + + ); + + %scr2lang = ( + 'InLatin1Supplement' => ['weurope'], + 'InLatinExtendedA' => [ + 'ceurope', + 'seurope', + 'tr', + 'vi' + ], + 'InLatinExtendedB' => [ + 'nordic', + 'baltic', + 'celtic' + ], + 'Thai' => ['th'], + 'Cyrillic' => ['ru', 'uk'], + 'Arabic' => ['ar'], + 'Greek' => ['el'], + 'Hebrew' => ['he'], + ); + + # better detection for CJK + @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); + %cjkscr2lang = ( + 'Hiragana' => ['ja'], + 'Katakana' => ['ja'], + 'Hangul' => ['ko'], + 'Han' => ['zh', 'ja', 'ko'], + ); + + if (HAS_ENCODE) { + unless (HAS_ENCODE_HANEXTRA) { + Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); + } + Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); + Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); + Encode::Alias::define_alias( qr/^Shift_JIS$/ => ' "cp932"' ); + if (HAS_ENCODE_EUCJPMS) { + Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' ); + } + } +} + +sub get_language { + my $str = shift; # $str must be UTF-8 encoding + my $charset = shift; + + return 'en' unless HAS_ENCODE; + return 'en' unless $charset; + if ($charset !~ /^utf/i) { + return $enc2lang{$charset}; + } elsif (defined($str)) { + $str =~ s/[\x00-\x7F]//g; # remove ASCII characters + return 'en' if ($str eq ''); + + my %handled; + $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); + foreach my $scr (@scrorder) { + next if ($str !~ /\p{$scr}/); + my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; + foreach my $lang (@$scrlangs) { + next if (exists($handled{$lang})); + foreach my $enc (@{$lang2enc{$lang}}) { + my $scratch = $str; + Encode::encode($enc, $scratch, Encode::FB_QUIET); + return $lang if ($scratch eq ''); + } + $handled{$lang} = 1; + } + } + } + return 'en'; +} + +# TEST 1: try conversion to use the specified charset. +# TEST 2: try conversion to use Encode::Detect. +# TEST 3: try conversion to use Encode::Guess. +sub normalize_charset { + my $str = shift; + my $charset = shift; + + return wantarray ? ($str, $charset) : $str unless HAS_ENCODE; + return wantarray ? ($str, 'ascii') : $str unless ($str); + + my $decoded; + my $detected; + + if ($charset) { + ($decoded, $detected) = _specified_encoding($str, $charset); + } + unless ($detected) { + ($decoded, $detected) = _encode_detect($str); + } + unless ($detected) { + ($decoded, $detected) = _encode_guess($str, $charset); + } + unless ($detected) { + return ($str, undef); + } + $decoded = Encode::encode_utf8($decoded); + + # unfold hiragana, katakana and han + if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) { + $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og; + } + return wantarray ? ($decoded, $detected) : $decoded; +} + +sub _specified_encoding { + my $str = shift; + my $encoding = shift; + + my $detected; + my $decoded; + # note: ISO-2022-* is not deistinguish from US-ASCII + if ($encoding and ($str !~ /\e/ or $encoding =~ /^ISO-2022/i)) { + my $encoder = Encode::find_encoding($encoding); + if (ref($encoder)) { + $decoded = $encoder->decode($str,Encode::FB_QUIET); + $detected = $encoder->name if ($str eq ''); + } + } + return ($decoded, $detected); +} + +sub _encode_detect { + return undef unless HAS_ENCODE_DETECT; + my $str = shift; + + my $decoded; + my $detected = Encode::Detect::Detector::detect($str); + if ($detected) { + my $encoder = Encode::find_encoding($detected); + if (ref($encoder)) { + $decoded = $encoder->decode($str); + $detected = $decoded ? $encoder->name : undef; + } + else { + $detected = undef; + } + } + return ($decoded, $detected); +} + +sub _encode_guess { + my $str = shift; + my $encoding = shift; + + my $detected; + my $decoded; + my $encoder; + if ($encoding) { + $encoding = Encode::resolve_alias($encoding); + if ($encoding) { + $encoder = Encode::Guess::guess_encoding($str, $encoding); + } + } + unless (ref($encoder)) { + if ($str =~ /\e/) { + $encoder = Encode::Guess::guess_encoding($str, qw/7bit-jis iso-2022-kr/); + } + elsif ($str =~ /[\x80-\xFF]{4}/) { + $encoder = Encode::Guess::guess_encoding($str, + qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); + } + else { + $encoder = Encode::Guess::guess_encoding($str, qw/iso-8859-1 cp1252/); + } + } + if (ref($encoder)) { + $detected = $encoder->name; + if ($detected =~ /^UTF-(?:16|32)[BL]E$/i) { + # The guessed UTF-16|32 encoding without BOM cannot be trusted. + $detected = undef; + } + if ($detected) { + $decoded = $encoder->decode($str); + } + } + return ($decoded, $detected); +} + +1; +