PageRenderTime 50ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/Lingua-Identify-Blacklists/bin/blacklist_classifier

https://bitbucket.org/tiedemann/blacklist-classifier
Perl | 186 lines | 139 code | 39 blank | 8 comment | 10 complexity | 2a9242ae2404ec047cb6d7664fb44703 MD5 | raw file
  1. #!/usr/bin/env perl
  2. #-*-perl-*-
  3. =encoding UTF-8
  4. =head1 USAGE
  5. =head2 Classification:
  6. blacklist_classifier [OPTIONS] lang1 lang2 ... < file
  7. =head2 training:
  8. blacklist_classifier -n [OPTIONS] text1 text2 > blacklist.txt
  9. blacklist_classifier [OPTIONS] -t "t1.txt t2.txt ..." lang1 lang2 ...
  10. =head2 run experiments:
  11. blacklist_classifier -t "t1.txt t2.txt ..." \
  12. -e "e1.txt e2.txt ..." \
  13. lang1 lang2 ...
  14. =head2 command line arguments:
  15. lang1 lang2 ... are language ID's
  16. blacklists are expected in <BlackListDir>/<lang1>-<lang2>.txt
  17. t1.txt t2.txt ... are training data files (in UTF-8)
  18. e1.txt e2.txt ... are evaluation data files (in UTF-8)
  19. the order of languages needs to be the same for the training data, the eval data,
  20. and the language IDs given as command line arguments (lang1 lang2 ..)
  21. -a <freq> ...... min freq for common words
  22. -b <freq> ...... max freq for uncommon words
  23. -c <score> ..... min difference score to be relevant
  24. -d <dir> ....... directory of black lists
  25. -i ............. classify each line separately
  26. -m <number> .... use approximately <number> tokens to train/classify
  27. -n ............. train a new black list
  28. -v ............. verbose mode
  29. -U ............. don't lowercase
  30. -S ............. don't tokenize (use the string as it is)
  31. -A ............. don't discard tokens with non-alphabetic characters
  32. =cut
  33. use strict;
  34. use vars qw($opt_a $opt_b $opt_c $opt_m $opt_n $opt_d $opt_v $opt_i
  35. $opt_t $opt_e $opt_F $opt_T $opt_L $opt_U $opt_S $opt_A $opt_M);
  36. use Getopt::Std;
  37. use FindBin qw($Bin);
  38. use lib "$Bin/../lib";
  39. use Lingua::Identify::Blacklists qw/:all/;
  # Parse the command line switches into the $opt_* package variables.
  # getopts() returns false when it meets an unknown switch or a switch
  # that lacks its required argument; stop right away instead of running
  # with a half-parsed configuration.
  getopts('a:b:c:d:im:nvt:e:F:T:L:USAM:')
      or die "invalid command line options (see 'perldoc $0' for usage)\n";

  # NOTE(review): -U, -S, -A and -M are accepted above, but nothing in this
  # script reads $opt_U, $opt_S, $opt_A or $opt_M, so they currently have
  # no effect here -- confirm how they are meant to reach the library.

  # All three standard streams are treated as UTF-8 text.
  binmode(STDIN,  ":encoding(UTF-8)");
  binmode(STDOUT, ":encoding(UTF-8)");
  binmode(STDERR, ":encoding(UTF-8)");

  # Frequency/score thresholds forwarded to the library through %options.
  # defined() (rather than a truth test) keeps an explicit 0 from the
  # command line instead of replacing it with the default.
  my $min_high = defined $opt_a ? $opt_a : 10;
  my $max_low  = defined $opt_b ? $opt_b : 3;
  my $min_diff = defined $opt_c ? $opt_c : 0.8;

  $Lingua::Identify::Blacklists::VERBOSE      = 1      if ($opt_v);
  $Lingua::Identify::Blacklists::BLACKLISTDIR = $opt_d if ($opt_d);

  # Fall back to the blacklists located relative to this script when the
  # configured directory is unset or does not exist. If the user asked for
  # a specific directory with -d, say so instead of switching silently.
  unless (defined $Lingua::Identify::Blacklists::BLACKLISTDIR
          && -d $Lingua::Identify::Blacklists::BLACKLISTDIR){
      my $fallback_dir = "$Bin/../share/blacklists";
      if ($opt_d){
          warn "blacklist directory '$opt_d' does not exist, using $fallback_dir\n";
      }
      $Lingua::Identify::Blacklists::BLACKLISTDIR = $fallback_dir;
  }

  # Options shared by the training, experiment and classification calls
  # below. text_size stays undef when -m was not given.
  my %options = ( text_size => $opt_m,
                  min_high  => $min_high,
                  max_low   => $max_low,
                  min_diff  => $min_diff );

  if ($opt_v){
      print "use blacklists in $Lingua::Identify::Blacklists::BLACKLISTDIR\n";
  }
  # Experiment mode (-e): run experiments with a given set of training
  # corpora (-t) and a set of evaluation corpora (-e) for the languages
  # named on the command line, then exit.
  if ($opt_e){
      my @langs = @ARGV;

      # Optional sweep: run an experiment with exponentially increasing
      # training sizes, starting at $opt_F and multiplying by $opt_L for
      # as long as the size stays below $opt_T.
      if ($opt_F && $opt_T && $opt_L){
          # A start size below zero or a factor of 1 or less never moves
          # text_size towards $opt_T, so the loop would not terminate.
          unless ($opt_F > 0 && $opt_L > 1){
              die "-F must be positive and -L must be greater than 1 "
                . "(got -F '$opt_F', -L '$opt_L')\n";
          }
          $options{text_size} = $opt_F;
          while ($options{text_size} < $opt_T){
              # report the size used in this round (not the -m value,
              # which the sweep has overridden)
              print "train with ca $options{text_size} tokens\n";
              run_experiment($opt_t,$opt_e,\%options,@langs);
              $options{text_size} *= $opt_L;
          }
      }

      # Final (or only) run: uses the text_size left by the sweep above,
      # or the -m value when no sweep was requested.
      run_experiment($opt_t,$opt_e,\%options,@langs);
      exit;
  }
  # Training mode (-t): train new black lists, pairing the i-th training
  # file with the i-th language ID given on the command line.
  if ($opt_t){
      # split(' ') skips leading whitespace, so a padded file list does
      # not produce an empty first file name.
      my @traindata = split(' ',$opt_t);
      my @langs = @ARGV;

      # Every language needs its own training file; otherwise some
      # languages would silently be mapped to an undefined file name.
      if (@traindata < @langs){
          die sprintf("need one training file per language: "
                    . "got %d file(s) for %d language(s)\n",
                      scalar(@traindata), scalar(@langs));
      }
      if (@traindata > @langs){
          warn sprintf("ignoring %d training file(s) without a language ID\n",
                       @traindata - @langs);
      }

      my %trainset = ();
      @trainset{@langs} = @traindata[0..$#langs];
      train( \%trainset, %options );
  }
  # Pairwise training mode (-n): train one black list from two text files.
  elsif ($opt_n){
      my $file1=shift(@ARGV);
      my $file2=shift(@ARGV);
      unless (defined $file1 && defined $file2){
          die "-n requires two text files: "
            . "blacklist_classifier -n [OPTIONS] text1 text2\n";
      }
      train_blacklist( $file1,$file2, %options );
  }
  # Classification mode (default): classify the text on STDIN among the
  # languages given on the command line and print one prediction per line.
  else{
      my @langs = @ARGV;
      # presumably cleared so that later reads do not treat the language
      # IDs as input file names -- verify against identify_stdin
      @ARGV = ();
      my @predictions = identify_stdin( langs      => \@langs,
                                        every_line => $opt_i,
                                        %options );
      print join("\n",@predictions), "\n";
  }
  99. =head1 AUTHOR
  100. Jörg Tiedemann, L<https://bitbucket.org/tiedemann>
  101. =head1 BUGS
  102. Please report any bugs or feature requests to
  103. L<https://bitbucket.org/tiedemann/blacklist-classifier>. I will be notified,
  104. and then you'll automatically be notified of progress on your bug as I
  105. make changes.
  106. =head1 SUPPORT
  107. You can find documentation for this module with the perldoc command.
  108. perldoc Lingua::Identify::Blacklists
  109. =head1 LICENSE AND COPYRIGHT
  110. Copyright 2012 Jörg Tiedemann.
  111. This program is free software: you can redistribute it and/or modify
  112. it under the terms of the GNU Lesser General Public License as published
  113. by the Free Software Foundation, either version 3 of the License, or
  114. (at your option) any later version.
  115. This program is distributed in the hope that it will be useful,
  116. but WITHOUT ANY WARRANTY; without even the implied warranty of
  117. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  118. GNU Lesser General Public License for more details.
  119. You should have received a copy of the GNU Lesser General Public License
  120. along with this program. If not, see L<http://www.gnu.org/licenses/>.
  121. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  122. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  123. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  124. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  125. OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  126. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  127. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  128. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  129. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  130. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  131. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  132. =cut