#!/usr/bin/perl
use strict;
use warnings;

use Encode;
use Mail::SpamAssassin;

# オブジェクト
my $sa;

# パラメータ、他
my $uniq_body = {};
my $uniq_subj = {};
my $num = 0;

init();
my $opts = getopt();
my $dirs = $opts->{dirs};
my $recursive = $opts->{recursive};
if (scalar @$dirs == 0) {
    usage();
    exit 1;
}
foreach my $dir (@$dirs) {
    if (-d $dir) {
        parse_dir($dir);
    }
    elsif (-f $dir) {
        my $message = read_message($dir);
        parse_msg($message);
    }
}

my $num_uniq_body = scalar keys %$uniq_body;
print STDERR "The number of messages     : $num\n";
print STDERR "The number of uniq_body messages: $num_uniq_body\n";

exit;

# 使い方
sub usage {
    print STDERR <<EOT;
Usage: ./sa-tokenizer.pl -r DIR  ...
          -r    Recursive directory search
          DIR   The directory including message files.
EOT
}

# オブジェクトの初期化
sub init {
    $sa = Mail::SpamAssassin->new({
              local_tests_only => 1,
              dont_copy_prefs  => 1,
          });
    $sa->init(1);
}

# オプションの取得
sub getopt {
    my $opts = {};
    my @dirs;
    while (my $opt = shift @ARGV) {
        if ($opt eq '-r') {
            $opts->{recursive} = 1;
        }
        else {
            push @dirs, $opt;
        }
    }
    $opts->{dirs} = \@dirs;
    return $opts;
}


sub parse_dir {
    my $dir = shift;

    opendir(DIR, $dir) or die "Unable to open $dir: $!";
    my @files = grep { ! /^\./ } readdir(DIR);
    closedir(DIR);

    print STDERR "Searching messages from $dir\n";

    foreach my $file (@files) {
        $file = "$dir/$file";
        if (-d $file and $recursive) {
            parse_dir($file);
        }
        elsif (-f $file) {
            my $message = read_message($file);
            parse_msg($message);
        }
    }

}

sub read_message {
    my $filename = shift;
    open(FILE, "< $filename") or die "Unable to open $filename: $!";
    my @messages;
    push @messages, $_ while (<FILE>);
    my $message = join '', @messages;
    close(FILE);
    return \$message;
}

# メッセージを解析して、UTF-8に変換済みで分かち書きが行われたデータを返す。
sub parse_msg {
    my $message = shift;

    my $msg     = $sa->parse($$message, 1);
    my $msgdata = $sa->{bayes_scanner}->get_body_from_msg($msg);
    my $body    = $msgdata->{bayes_token_body};

    my $subj = shift @$body;
    if (!exists $uniq_subj->{$subj}) {
        $uniq_subj->{$subj} = 1;
        $subj = encode_utf8($subj) if (utf8::is_utf8($subj));
        foreach my $token (split /\s+/, $subj) {
            print $token,"\n";
        }
        print "\n";
    }

    my $key = join '', @$body;
    if (!exists $uniq_body->{$key}) {
        $uniq_body->{$key} = 1;
        foreach my $line (@$body) {
            $line = encode_utf8($line) if (utf8::is_utf8($line));
            foreach my $token (split /\s+/, $line) {
                print $token,"\n";
            }
        }
        print "\n";
    }
    $num++;
    $msg->finish();
}

# vim: set ts=4 sw=4 et :


