blacklist_classifier

/Lingua-Identify-Blacklists/bin/blacklist_classifier

https://bitbucket.org/tiedemann/blacklist-classifier
Perl | 186 lines | 139 code | 39 blank | 8 comment | 10 complexity | 2a9242ae2404ec047cb6d7664fb44703 MD5 | raw file

#!/usr/bin/env perl
#-*-perl-*-


=encoding UTF-8

=head1 USAGE

=head2 Classification:

   blacklist_classifier [OPTIONS] lang1 lang2 ... < file

=head2 training:

   blacklist_classifier -n [OPTIONS] text1 text2 > blacklist.txt
   blacklist_classifier [OPTIONS] -t "t1.txt t2.txt ..." lang1 lang2 ...

=head2 run experiments:

   blacklist_classifier -t "t1.txt t2.txt ..." \
                           -e "e1.txt e2.txt ..." \
                           lang1 lang2 ...

=head2 command line arguments:

 lang1 lang2 ... are language IDs
 blacklists are expected in <BlackListDir>/<lang1-lang2>.txt
 t1.txt t2.txt ... are training data files (in UTF-8)
 e1.txt e2.txt ... are evaluation data files (in UTF-8)
 training data and eval data need to be listed in the same order
   as the languages given by the command line arguments (lang1 lang2 ..)


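 -a <freq> ...... min freq for common words (default: 10)
 -b <freq> ...... max freq for uncommon words (default: 3)
 -c <score> ..... min difference score to be relevant (default: 0.8)
 -d <dir> ....... directory of black lists
 -e <files> ..... evaluation data files (space-separated, one per language)
 -i ............. classify each line separately
 -m <number> .... use approximately <number> tokens to train/classify
 -n ............. train a new black list
 -t <files> ..... training data files (space-separated, one per language)
 -v ............. verbose mode

 -F <number> .... with -e: start experiments with <number> training tokens
 -T <number> .... with -e: repeat while the training size is below <number>
 -L <factor> .... with -e: multiply the training size by <factor> each round
                  (-F, -T and -L have to be given together)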

 -U ............. don't lowercase
 -S ............. don't tokenize (use the string as it is)
 -A ............. don't discard tokens with non-alphabetic characters

=cut

use strict;
use vars qw($opt_a $opt_b $opt_c $opt_m $opt_n $opt_d $opt_v $opt_i
            $opt_t $opt_e $opt_F $opt_T $opt_L $opt_U $opt_S $opt_A $opt_M);
use Getopt::Std;
use FindBin qw($Bin);

use lib "$Bin/../lib";
use Lingua::Identify::Blacklists qw/:all/;


# Parse the command line. Getopt::Std stores every flag in the matching
# $opt_X package variable; letters followed by ':' take a value.
# NOTE(review): -U, -S, -A and -M are accepted here (and -U/-S/-A are
# documented in USAGE), but nothing in this script reads $opt_U, $opt_S,
# $opt_A or $opt_M afterwards -- confirm whether they should be forwarded
# to Lingua::Identify::Blacklists.
getopts('a:b:c:d:im:nvt:e:F:T:L:USAM:');

# All three standard streams are handled as UTF-8.
binmode(STDIN,":encoding(UTF-8)");
binmode(STDOUT,":encoding(UTF-8)");
binmode(STDERR,":encoding(UTF-8)");

# Training thresholds (-a, -b, -c) with their defaults.
my $min_high = defined $opt_a ? $opt_a : 10;
my $max_low  = defined $opt_b ? $opt_b : 3;
my $min_diff = defined $opt_c ? $opt_c : 0.8;


$Lingua::Identify::Blacklists::VERBOSE      = 1 if ($opt_v);
$Lingua::Identify::Blacklists::BLACKLISTDIR = $opt_d if ($opt_d);

# Fall back to share/blacklists relative to the script's location when the
# configured blacklist directory is unset or does not exist.
unless (defined $Lingua::Identify::Blacklists::BLACKLISTDIR
        && -d $Lingua::Identify::Blacklists::BLACKLISTDIR){
    $Lingua::Identify::Blacklists::BLACKLISTDIR = "$Bin/../share/blacklists";
}

# Options handed on to the training/classification calls further down;
# text_size stays undefined unless -m was given.
my %options = ( text_size => $opt_m,
                min_high  => $min_high,
                max_low   => $max_low,
                min_diff  => $min_diff );


# Verbose diagnostics go to STDERR: STDOUT carries the program's real
# output (the new blacklist in -n mode, the predictions when classifying)
# and must not be mixed with status messages.
if ($opt_v){
    print STDERR "use blacklists in $Lingua::Identify::Blacklists::BLACKLISTDIR\n";
}

# Experiment mode (-e): call run_experiment with the training files (-t),
# the evaluation files (-e), the options and the language IDs, then exit.
# run_experiment is not defined in this script; presumably it is imported
# from Lingua::Identify::Blacklists.

if ($opt_e){
    my @langs = @ARGV;
    # Optional series of runs with exponentially increasing training sizes:
    # when -F, -T and -L are all given, start with $opt_F tokens and
    # multiply the size by $opt_L after every run for as long as it stays
    # below $opt_T.
    if ($opt_F && $opt_T && $opt_L){
        # The size can only reach $opt_T if it starts positive and grows;
        # reject settings that would make the loop run forever.
        unless ($opt_F > 0 && $opt_L > 1){
            die "-F must be greater than 0 and -L greater than 1\n";
        }
        $options{text_size} = $opt_F;
        while ($options{text_size} < $opt_T){
            # report the size used in this round (not the unrelated -m value)
            print "train with ca $options{text_size} tokens\n";
            run_experiment($opt_t,$opt_e,\%options,@langs);
            $options{text_size} *= $opt_L;
        }
    }
    # Final run -- or the only one without -F/-T/-L -- with whatever
    # text_size is set at this point.
    run_experiment($opt_t,$opt_e,\%options,@langs);
    exit;
}


# Main dispatch:
#   -t  train with one training file per language ID,
#   -n  train a new blacklist from two text files,
#   otherwise classify the text read from STDIN.
# train, train_blacklist and identify_stdin are not defined in this script;
# presumably they are imported from Lingua::Identify::Blacklists.

if ($opt_t){
    # split(' ', ...) skips leading whitespace, so a padded -t value cannot
    # produce an empty first entry that shifts the files against the
    # language IDs.
    my @traindata = split(' ',$opt_t);
    my @langs = @ARGV;
    if (@traindata < @langs){
        die "need one training file per language (got "
            . scalar(@traindata) . " files for "
            . scalar(@langs) . " languages)\n";
    }
    # pair every language ID with the training file at the same position
    my %trainset = ();
    @trainset{@langs} = @traindata[0..$#langs];
    train( \%trainset, %options );
}
elsif ($opt_n){
    my $file1=shift(@ARGV);
    my $file2=shift(@ARGV);
    unless (defined $file1 && defined $file2){
        die "-n requires two text files\n";
    }
    train_blacklist( $file1,$file2, %options );
}


# classify

else{
    my @langs = @ARGV;
    # Empty @ARGV after copying the language IDs -- presumably so that
    # nothing downstream mistakes them for input files; TODO confirm.
    @ARGV = ();
    my @predictions = identify_stdin( langs => \@langs,
                                      every_line => $opt_i,
                                      %options );
    # one prediction per line, terminated by a final newline
    print join("\n",@predictions);
    print "\n";
}




=head1 AUTHOR

Jörg Tiedemann, L<https://bitbucket.org/tiedemann>

=head1 BUGS

Please report any bugs or feature requests to
L<https://bitbucket.org/tiedemann/blacklist-classifier>.  I will be notified,
and then you'll automatically be notified of progress on your bug as I
make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Lingua::Identify::Blacklists

=head1 LICENSE AND COPYRIGHT

Copyright 2012 Jörg Tiedemann.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program.  If not, see L<http://www.gnu.org/licenses/>.


THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=cut