/mythes-en-3.0/wn2ooo/th_gen_idx.pl
# · Perl · 61 lines · 39 code · 8 blank · 14 comment · 2 complexity · 2a6e9cdca26a3db77dc87f81ee0cf4fc MD5 · raw file
- #!/usr/bin/perl
- # Taken from Kevin B. Hendricks' MyThes, see LICENSE_th_gen_idx.txt
- # perl program to take a thesaurus structured text data file
- # and create the proper sorted index file (.idx)
- #
- # typically invoked as follows:
- # cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
- #
# Sort comparator for index lines of the form "entry|offset".
#
# Reads the two lines to compare from the package variables $a and $b,
# which sort() sets before each call, and orders them by the entry text
# only, using a plain string comparison (cmp).  The offset part never
# takes part in the comparison, so two lines with the same entry compare
# as equal.
#
# Returns -1, 0 or 1, as sort() expects from a comparator.
sub by_entry {
    # Only the first field is needed, so stop splitting right after it.
    my ($aent) = split('\|', $a, 2);
    my ($bent) = split('\|', $b, 2);
    return $aent cmp $bent;
}
# main routine
#
# Reads a thesaurus data file from STDIN and writes its index to STDOUT.
#
# Input layout, as consumed below:
#   line 1          encoding name
#   then, per word  one "entry|count" line followed by <count> meaning lines
#
# Output layout:
#   line 1          the encoding name, copied from the input
#   line 2          number of index entries
#   then            one "entry|offset" line per word, sorted by entry, where
#                   offset is the position of the word's "entry|count" line
#                   in the input, counted from the start of the input
#
# NOTE(review): offsets are accumulated with length(), so they are byte
# offsets only while STDIN has no decoding layer (e.g. no -C / PERL_UNICODE
# setting) -- confirm for the environments this is run in.

# Lexically scoped: both pragmas apply from here to the end of the file.
use strict;
use warnings;

my @tindex  = ();    # the index itself, one "entry|offset" string per word
my $foffset = 0;     # file position offset into thesaurus

# top line of thesaurus provides encoding
my $encoding = <STDIN>;
$encoding = "" unless defined $encoding;    # empty input: still emit a header
$foffset += length($encoding);
chomp($encoding);

# read thesaurus line by line
# first line of every block is an entry and meaning count
while (defined(my $rec = <STDIN>)) {
    my $rl = length($rec);    # length of the whole block, line ends included
    chomp($rec);

    # A blank line is not a word block; account for its length so later
    # offsets stay correct, but do not index it as an empty entry.
    if ($rec eq "") {
        $foffset += $rl;
        next;
    }

    my ($entry, $nm) = split('\|', $rec);

    # A line without a count field announces no meaning lines.
    my $count = defined $nm ? $nm + 0 : 0;

    # Skip over the meaning lines; only their length matters here.
    my $p = 0;
    while ($p < $count) {
        my $meaning = <STDIN>;
        # Input ended before all announced meanings arrived: stop reading
        # instead of looping over the remaining count.
        last unless defined $meaning;
        $rl += length($meaning);
        $p++;
    }

    push(@tindex, "$entry|$foffset");
    $foffset += $rl;
}

# now we have all of the information
# so sort it and then output the encoding, count and index data
@tindex = sort by_entry @tindex;
print STDOUT "$encoding\n";
print STDOUT scalar(@tindex), "\n";
foreach my $one (@tindex) {
    print STDOUT "$one\n";
}