#!/usr/bin/perl
use strict;
use warnings;

use Encode;
use Text::MeCab;
use Lingua::JA::Hepburn::Passport;

############################################################################
#### 設定パラメータ ########################################################
############################################################################
# 生成するテストの数
my $max_tests    = 200;

# 生成するスコアの最大値
my $max_score    = 0.6;

# 生成するスコアの最小値
my $min_score    = 0.2;

# 生成するテスト名の最大文字数
my $max_len_name = 31;

# hamメールの重み付け
my $ham_ampli_rate = 10;

############################################################################
# ひらがな、カタカナ、漢字、全角英数字
my $JIS_RE = qr{(?:
    # Hiragana
      \xE3\x81[\x80-\xBF]
    | \xE3\x82[\x80-\x9F]
    # Katakana
    | \xE3\x82[\xA0-\xBF]
    | \xE3\x83[\x80-\xBF]
    # Kanji
    | \xE3[\x90-\xBF][\x80-\xBF]
    | [\xE4-\xE9][\x80-\xBF]{2}
    | \xEF[\xA4-\xAB][\x80-\xBF]
    # Fullwidth
    | \xEF\xBC[\x90-\x99]
    | \xEF\xBC[\xA1-\xBA]
    | \xEF\xBD[\x81-\x9A]
)+}x;

# ひらがな
my $HIRAGANA_RE = qr{(?:
      \xE3\x81[\x80-\xBF]
    | \xE3\x82[\x80-\x9F]
)+}x;

# カタカナ(30A1-30F6,30FC)
my $KATAKANA_RE = qr{(?:
      \xE3\x82[\xA1-\xBF]
    | \xE3\x83[\x80-\xB6]
    | \xE3\x83\xbc
)+}x;


# 全角英数字
my $FULLWIDTH_RE = qr{(?:
      \xEF\xBC[\x90-\x99]
    | \xEF\xBC[\xA1-\xBA]
    | \xEF\xBD[\x81-\x9A]
)+}x;

# 無効にするトークン
my $IGNORE_RE = qr{(?:
    # ひらがな
    (?:
          \xE3\x81[\x80-\xBF]
        | \xE3\x82[\x80-\x9F]
    )+

    |

    # カタカナ２文字以内
    (?:
          \xE3\x82[\xA0-\xBF]
        | \xE3\x83[\x80-\xBF]
    ){1,2}

    |

    # 漢字１文字＋ひらがな１文字
    (?:
          \xE3[\x90-\xBF][\x80-\xBF]
        | [\xE4-\xE9][\x80-\xBF]{2}
        | \xEF[\xA4-\xAB][\x80-\xBF]
    )
    (?:
          \xE3\x81[\x80-\xBF]
        | \xE3\x82[\x80-\x9F]
    )

    |

    # 全角数字
    (?:
        \xEF\xBC[\x90-\x99]
    )+
)}x;



# オブジェクト
my $mecab;
my $romanize;

# パラメータ、他
my $tokens_spam = {};
my $tokens_ham  = {};
my $ratios      = {};
my $romaji2token = {};
my $token2romaji = {};
my $num      = 0;
my $num_spam = 0;
my $num_ham  = 0;
my $max_suffix = 9;
my $similer_token_rate = 1.1;

init();
my $opts = getopt();
if (exists $opts->{spamfile}) {
    $num = 0;
    parse_token_counts($tokens_spam, $opts->{spamfile});
    $num_spam = $num;
}
else {
    usage();
    exit 1;
}

if (exists $opts->{hamfile}) {
    $num = 0;
    parse_token_counts($tokens_ham, $opts->{hamfile});
    $num_ham = $num;
}

my $num_uniq = scalar(keys %$tokens_spam);
print STDERR "The spam words:\n";
print STDERR "  The number of Japanese words    : " . $num_spam . "\n";
print STDERR "  The number of uniq words        : " . $num_uniq . "\n";

my $num_ham_uniq = scalar(keys %$tokens_ham);
print STDERR "The ham words:\n";
print STDERR "  The number of Japanese words    : " . $num_ham . "\n";
print STDERR "  The number of uniq words        : " . $num_ham_uniq . "\n";

calculate_ratio($tokens_spam, $tokens_ham, $ratios);
eliminate($tokens_spam, $ratios);
my $num_uniq2 = scalar(keys %$tokens_spam);
my $num_removed = $num_uniq - $num_uniq2;
print STDERR "The number of removed words       : " . $num_removed . "\n";
print STDERR "The number of remaining uniq words: " . $num_uniq2 . "\n";

my $num_tests = write_rules($tokens_spam, $tokens_ham, $ratios, $max_tests);
print STDERR "The number of made tests          : " . $num_tests . "\n";
 
exit;

# 使い方
sub usage {
    print STDERR "Usage: ./sa-ja-testmaker.pl -s SPAMFILE [-h HAMFILE]\n";
}

# オプションの取得
sub getopt {
    my $opts = {};
    while (my $opt = shift @ARGV) {
        if ($opt eq '-s') {
            $opt = shift @ARGV;
            if (defined $opt and $opt ne '') {
                $opts->{spamfile} = $opt;
            }
        }
        elsif ($opt eq '-h') {
            $opt = shift @ARGV;
            if (defined $opt and $opt ne '') {
                $opts->{hamfile} = $opt;
            }
        }
    }
    return $opts;
}

# オブジェクトの初期化
sub init {
    $mecab = Text::MeCab->new({
                 output_format_type => "user",
                 node_format        => '%f[7]',
                 bos_format         => '',
                 eos_format         => '',
             });

    $romanize = Lingua::JA::Hepburn::Passport->new;
}

# トークンの出現回数を調べる。
sub parse_token_counts {
    my $tokens = shift;
    my $filename = shift;

    my $old_token1 = '';
    my $old_token2 = '';
    my $token = '';

    open(TOKENS, "< $filename")
        or die "Unable to read $filename: $!";
    while (<TOKENS>) {
        chomp();
        foreach (split /\s+/) {
            if (!/^$JIS_RE$/o) {
                $old_token1 = '';
                $old_token2 = '';
                next;
            }
    
            my $token = $_;
            add_token($tokens, $token);

            if ($old_token1 ne '' && !is_hiragana(\$token)) { 
                if (!is_hiragana(\$old_token1)) {
                    add_token($tokens, $old_token1, $token);
                }
                if ($old_token2 ne '' && !is_hiragana(\$old_token2)) {
                    add_token($tokens, $old_token2, $old_token1, $token);
                }
            }
            $old_token2 = $old_token1;
            $old_token1 = $token;
            $num++;
        }
    }
    close(TOKENS);
}

# トークンをカウントする。
#   次のトークンを無効にする。
#       - ローマ字への変換ができない
#       - 一文字
#       - 平仮名だけ
#       - 片仮名二文字
#       - 漢字一文字＋平仮名一文字
#       - 全角数字だけ
#   ローマ字への変換が重複する場合は接尾辞として数字を付ける。
sub add_token {
    my $tokens = shift;
    my @token_array = @_;
    my $token = join '', @token_array;

    my $count = 0;
    if (!exists $tokens->{$token}) {
        my $romaji = create_romaji($token, \@token_array);

        if (!$romaji) {
            $tokens->{$token} = {};
            $tokens->{$token}->{count} = 0;
            $tokens->{$token}->{disabled} = 1;
            return;
        }
        if (exists $romaji2token->{$romaji}) {
            if ($romaji2token->{$romaji} ne $token) {
                for (my $i = 2; $i <= $max_suffix; $i++) {
                    my $buf = join '_', $romaji, $i;
                    if (!exists $romaji2token->{$buf}) {
                        $romaji = $buf;
                        last;
                    }
                }
                $token2romaji->{$token} = $romaji;
                $romaji2token->{$romaji} = $token;
            }
        }
        else {
            $token2romaji->{$token} = $romaji;
            $romaji2token->{$romaji} = $token;
        }
        $tokens->{$token} = {};
        if (is_ignored(\$token)) {
            $tokens->{$token}->{count} = 0;
            $tokens->{$token}->{disabled} = 1;
            return;
        }
        if (scalar(@token_array) > 1) {
            $tokens->{$token}->{related} = \@token_array;
        }
    }
    else {
        return if ($tokens->{$token}->{disabled});
        $count = $tokens->{$token}->{count};
    }
    $count++;
    $tokens->{$token}->{count} = $count;
}

# ローマ字を作成する。
sub create_romaji {
    my $token = shift;
    my $token_array_ref = shift;

    my $romaji = romanize_token($token);
    $romaji = undef if ($romaji and $romaji =~ /_/);
    if (!$romaji and scalar(@$token_array_ref) > 1) {
        my @romans;
        foreach my $t (@$token_array_ref) {
            my $roman;
            if (exists $token2romaji->{$t}) {
                $roman = $token2romaji->{$t};
                return if (!$roman);
                $roman =~ s/_\d$//;
            }
            else {
                return;
            }
            push @romans, $roman if ($roman);
        }
        my $romaji_joined = scalar(@romans) > 0 ? join('', @romans) : '';
        if ($romaji_joined !~ /_/) {
            $romaji = $romaji_joined;
        }
    }
    return $romaji;
}

# spamの出現頻度をhamにより補正する。
sub calculate_ratio {
    my $tokens_spam = shift;
    my $tokens_ham  = shift;
    my $ratios = shift;

    if ($num_ham > 0) {    
        foreach my $token (keys %$tokens_spam) {
            my $ratio_spam = $tokens_spam->{$token}->{count} / $num_spam;
            my $ratio_ham  = 0;
            if (exists $tokens_ham->{$token}) {
                $ratio_ham  = $tokens_ham->{$token}->{count} / $num_ham * $ham_ampli_rate;
            }
            $ratios->{$token} = int(($ratio_spam - $ratio_ham) * 100000) / 100000;
        }
    }
    else {
        foreach my $token (keys %$tokens_spam) {
            my $ratio_spam = $tokens_spam->{$token}->{count} / $num_spam;
            $ratios->{$token} = int($ratio_spam * 100000) / 100000;
        }
    }
}
    
# 余計なトークンを削除する。
sub eliminate {
    my $tokens = shift;
    my $ratios = shift;

    # トークンの出現回数の下限値を平均出現回数とする。
    my $lower_limit = int($num / scalar(keys %$tokens));

    # 似たようなトークンを無効にする。
    foreach my $token (keys %$tokens) {
        my $count = $tokens->{$token}->{count};
        if (exists $tokens->{$token}->{related}) {
            my $related = $tokens->{$token}->{related};
            foreach my $r (@$related) {
                next unless (exists $tokens->{$r});
                if ($count * $similer_token_rate > $tokens->{$r}->{count}) {
                    $tokens->{$r}->{disabled} = 1;
                }
            }
            my $num_related = scalar(@$related);
            if ($num_related > 2) {
                for (my $i = 0; $i < $num_related - 1; $i++) {
                    if (exists $related->[$i] and exists $related->[$i + 1]) {
                        my $r = $related->[$i] . $related->[$i + 1];
                        next unless (exists $tokens->{$r});
                        if ($count * $similer_token_rate > $tokens->{$r}->{count}) {
                            $tokens->{$r}->{disabled} = 1;
                        }
                    }
                }
            }
        }
    }

    # 出現頻度の低いトークンおよび無効なトークンを削除する。
    foreach my $token (keys %$tokens) {
        if (
            exists($tokens->{$token}->{disabled})
            or ($tokens->{$token}->{count} < $lower_limit)
            or ($ratios->{$token} < 0)
        ) {
            delete $tokens->{$token}->{count};
            delete $tokens->{$token}->{related};
            delete $tokens->{$token}->{disabled};
            delete $tokens->{$token};
            delete $ratios->{$token};
        }
    }
}

# ルールを出力する。
#     特定のトークンのスコアが高すぎて相対的に全体のスコアが小さくなるのを
#     防ぐために上位から5%の位置のトークンを最大スコアとする。
sub write_rules {
    my $tokens_spam = shift;
    my $tokens_ham  = shift;
    my $ratios = shift;
    my $max_tests = shift;
    my $max_score_percent = 5;

    my @sorted_tokens = sort { $ratios->{$b} <=> $ratios->{$a} } keys %$ratios;
    my $num_tests = scalar(@sorted_tokens);
    $num_tests = $num_tests > $max_tests ? $max_tests : $num_tests;
    my $max_idx = int($num_tests * $max_score_percent / 100);
    my $max_ratio = $ratios->{$sorted_tokens[$max_idx]};
    my $min_ratio = $ratios->{$sorted_tokens[$num_tests - 1]};

    my $diff_score = $max_score - $min_score;
    my $diff_ratio = $max_ratio - $min_ratio;
    for (1..$num_tests) {
        my $token = shift @sorted_tokens;
        my $ratio = $ratios->{$token};
        my $score = int(
                        (
                           $diff_score * ($ratio - $min_ratio) / $diff_ratio
                           + $min_score
                        )
                        * 10
                    )
                    / 10
                    ;
        $score = ($score > $max_score) ? $max_score : $score;
        my $count_spam = $tokens_spam->{$token}->{count} || 0;
        my $count_ham  = $tokens_ham->{$token}->{count}  || 0;
        my $ratio_str = "spam=${count_spam}/${num_spam}, ham=${count_ham}/${num_ham}, ratio=${ratio}";
        my $romaji = $token2romaji->{$token};
        $romaji = adjust_romaji_suffix($tokens_spam, $ratios, $token, $ratio, $romaji);
        my $rulename = 'BODY_JA_' . $romaji;
        print_rule($token, $ratio_str, $rulename, $token, $romaji, $score);
    }
    return $num_tests;
}

# ローマ字の重複を防ぐためのサフィックスの調整
sub adjust_romaji_suffix {
    my $tokens = shift;
    my $ratios = shift;
    my $token  = shift;
    my $ratio  = shift;
    my $romaji = shift;

    if ($romaji =~ /^(.*)(_\d)$/) {
        my $romaji_tmp = $1;
        my $suffix = $2;
        if (exists $romaji2token->{$romaji_tmp}) {
            my $token_tmp = $romaji2token->{$romaji_tmp};
            if (exists $tokens->{$token_tmp}) {
                if ($ratio > $ratios->{$token_tmp}) {
                    $romaji2token->{$romaji}     = $token_tmp;
                    $romaji2token->{$romaji_tmp} = $token;
                    $token2romaji->{$token}      = $romaji_tmp;
                    $token2romaji->{$token_tmp}  = $romaji;
                    $romaji = $romaji_tmp;
                }
            }
            else {
                $romaji2token->{$romaji}     = $token_tmp;
                $romaji2token->{$romaji_tmp} = $token;
                $token2romaji->{$token}      = $romaji_tmp;
                $romaji = $romaji_tmp;
            }
        }
    }
    return $romaji;
}

# トークンをローマ字に変換する。
# 変換できない文字は"_"に変換される。
sub romanize_token {
    my $token = shift;

    my $roman = '';

    # トークンがカタカナだけの場合はそのままローマ字に変換する。
    # トークンが全角英数字だけの場合は半角英数字に変換する。
    if ($token =~ /^$KATAKANA_RE$/o) {
        $token = decode_utf8($token);
        $roman = $romanize->romanize($token);
        $roman =~ s/[^A-Za-z]//g;
        return $roman;
    }
    elsif ($token =~ /^$FULLWIDTH_RE$/o) {
        return to_halfwidth_alpha($token);
    }

    # トークンがカタカナ以外の文字を含む場合はMeCabを使ってカタカナに変換する。
    # それからローマ字に変換する。
    my @roman_array;
    for (my $node = $mecab->parse($token); $node; $node = $node->next) {
        my $kana = $node->format($mecab);
        if ($kana =~ /^$KATAKANA_RE$/o) {
            $kana = decode_utf8($kana);
            $roman = $romanize->romanize($kana);
            $roman =~ s/[^A-Za-z]//g;
        }
        elsif (!$kana or $kana eq '') {
            next;
        }
        else {
            $roman = '_';
        }
        push @roman_array, $roman;
    }
    my $str = join '', @roman_array;
    $str = uc($str);
    return $str;
}

sub print_rule {
    my $token   = shift;
    my $ratio   = shift;
    my $name    = shift;
    my $pattern = shift;
    my $kana    = shift;
    my $score   = shift;


    my $len_name = length $name;
    my $num_tab = int(($max_len_name - $len_name - 1) / 8);
    $num_tab = $num_tab < 0 ? 0 : $num_tab;
    my $tabs;
    foreach (0..$num_tab) {
        $tabs .= "\t";
    }

print <<EOT;
# $name: $token $ratio
body     $name$tabs/$pattern/
describe $name$tabs$kana
score    $name$tabs$score

EOT
}

# トークンがひらがなだけであるかどうかを判断する。
sub is_hiragana {
    my $token_ref = shift;

    return ($$token_ref =~ /^$HIRAGANA_RE$/o) ? 1 : 0;
}

# トークンを無効にするかを判断する。
sub is_ignored {
    my $token_ref = shift;
    return ($$token_ref =~ /^$IGNORE_RE$/o or length($$token_ref) <= 3) ? 1 : 0;
}

# 全角英数字を半角英数字に変換する。
sub to_halfwidth_alpha {
    my $str = shift;

    my @dstr;
    use utf8;
    $str = decode_utf8($str);
    foreach my $ch (split '', $str) {
        my $c = ord $ch;
        if ($c > 0xFF00 && $c < 0xFF5F) {
            $c = $c - 0xFF00 + 0x20;
            $ch = chr($c);
        }
        push @dstr, $ch;
    }
    return join '', @dstr;
}

# vim: set ts=4 sw=4 et :


