PageRenderTime 41ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/mythes-en-3.0/wn2ooo/th_gen_idx.pl

#
Perl | 61 lines | 39 code | 8 blank | 14 comment | 2 complexity | 2a6e9cdca26a3db77dc87f81ee0cf4fc MD5 | raw file
Possible License(s): LGPL-2.0
  1. #!/usr/bin/perl
  2. # Taken from Kevin B. Hendricks' MyThes, see LICENSE_th_gen_idx.txt
  3. # perl program to take a thesaurus structured text data file
  4. # and create the proper sorted index file (.idx)
  5. #
  6. # typically invoked as follows:
  7. # cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
  8. #
  9. sub by_entry {
  10. my ($aent, $aoff) = split('\|',$a);
  11. my ($bent, $boff) = split('\|',$b);
  12. $aent cmp $bent;
  13. }
  14. # main routine
  15. my $ne = 0; # number of entries in index
  16. my @tindex=(); # the index itself
  17. my $foffset = 0; # file position offset into thesaurus
  18. my $rec=""; # current string and related pieces
  19. my $rl=0; # misc string length
  20. my $entry=""; # current word being processed
  21. my $nm=0; # number of meaning for the current word
  22. my $meaning=""; # current meaning and synonyms
  23. my $p; # misc uses
  24. my $encoding; # encoding used by text file
  25. # top line of thesaurus provides encoding
  26. $encoding=<STDIN>;
  27. $foffset = $foffset + length($encoding);
  28. chomp($encoding);
  29. # read thesaurus line by line
  30. # first line of every block is an entry and meaning count
  31. while ($rec=<STDIN>){
  32. $rl = length($rec);
  33. chomp($rec);
  34. ($entry, $nm) = split('\|',$rec);
  35. $p = 0;
  36. while ($p < $nm) {
  37. $meaning=<STDIN>;
  38. $rl = $rl + length($meaning);
  39. chomp($meaning);
  40. $p++;
  41. }
  42. push(@tindex,"$entry|$foffset");
  43. $ne++;
  44. $foffset = $foffset + $rl;
  45. }
  46. # now we have all of the information
  47. # so sort it and then output the encoding, count and index data
  48. @tindex = sort by_entry @tindex;
  49. print STDOUT "$encoding\n";
  50. print STDOUT "$ne\n";
  51. foreach $one (@tindex) {
  52. print STDOUT "$one\n";
  53. }