PageRenderTime 64ms CodeModel.GetById 33ms RepoModel.GetById 1ms app.codeStats 0ms

/uplug-main/uplug

https://bitbucket.org/tiedemann/uplug
Perl | 457 lines | 327 code | 106 blank | 24 comment | 37 complexity | ebc87f24872d172c0afb6fefceb28726 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. #!/usr/bin/env perl
  2. # -*-perl-*-
  3. #
  4. #---------------------------------------------------------------------------
  5. # Copyright (C) 2004 Jörg Tiedemann
  6. #
  7. # This program is free software; you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation; either version 2 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program; if not, write to the Free Software
  19. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  20. #---------------------------------------------------------------------------
  21. =head1 NAME
  22. uplug - the main startup script for the Uplug toolbox
  23. =head1 SOURCES AND EXTENSIONS
  24. For the latest sources, language packs, additional modules and tools: Please, have a look at the project website at L<https://bitbucket.org/tiedemann/uplug>
  25. =head1 SYNOPSIS
  26. uplug [-ehHlp] [-f fallback] config-file [MODULE-ARGUMENTS]
  27. C<config-file> is a valid Uplug configuration file (describing a module that may consist of several sub-modules). Configuration files can be given with the absolute and relative paths. If they are not found as specified, then Uplug will look at C<UplugSharedDir/systems/>
  28. =head1 OPTIONS
  29. -e ............. returns the location of the given config-file
  30. -f fallback .... fallback modules (config-files separated by ':')
  31. -h ............. show a help text (also for specific config-files)
  32. -H ............. show the man page
  33. -l ............. list all modules (Uplug config files)
  34. -p ............. print the configuration file
  35. Other command-line options depend on the specifications in the configuration file. Each module may define its own arguments and options. For example, the basic pre-processing module accepts command-line arguments for input and output and for the input encoding:
  36. uplug pre/basic -in 1988en.txt -ci 'iso-8859-1' -out 1988en.xml
  37. This will take the generic C<basic> pre-processing module found in C<UplugSharedDir/systems/pre>, process the text in C<1988en.txt> (which is assumed to be in ISO-8859-1) and produce C<1988en.xml>.
  38. =cut
  39. use strict;
  40. # make it possible to use local copies of Uplug without installing
  41. use FindBin qw($Bin);
  42. use lib "$Bin/lib";
  43. use Uplug;
  44. use Uplug::Config;
  45. use Getopt::Std;
  # Parse the flags of the startup script itself. Module-specific
  # arguments (everything after the config-file) are left in @ARGV and
  # handed to the Uplug module further down.
  my %opts;
  my $known_opts = 'ef:hHlp';
  getopts ($known_opts, \%opts);

  # Informational modes: each of these subs prints its output and then
  # exits, so at most one of them is ever executed.
  &help_message()       if ($opts{H});
  &usage(@ARGV)         if ($opts{h});
  &find_config(@ARGV)   if ($opts{e});
  &list_modules(@ARGV)  if ($opts{l});
  &print_config(@ARGV)  if ($opts{p});

  # set some essential locations in the environment
  $ENV{UPLUGHOME}  = $Bin;
  $ENV{UPLUGSHARE} = &shared_home();

  # The first remaining argument names the module (config-file) to run.
  # Without it there is nothing to do: say so and show the usage text
  # instead of failing later with "Cannot find the Uplug module ''!".
  my $module = shift(@ARGV);
  unless (defined $module && length $module){
      print STDERR "Please give a Uplug configuration file!\n\n";
      &usage();                       # prints the usage text and exits
  }

  # check whether the module exists; if not, try the fallback modules
  # given with -f (a ':'-separated list) in the order they were given
  unless (-e &FindConfig($module)){
      my @fallback = defined $opts{f} ? split(/:/,$opts{f}) : ();
      foreach my $candidate (@fallback){
          if (-e &FindConfig($candidate)){
              $module = $candidate;
              last;
          }
      }
  }
  die "Cannot find the Uplug module '$module'!\n" unless (-e &FindConfig($module));

  # load and run
  my $uplug=Uplug->new($module,@ARGV);   # create a new uplug module
  $uplug->load();                        # load it
  $uplug->run();                         # and run it

  # usage(@config_files)
  #
  # Print the usage text generated from this script's POD and exit with
  # status 1. When arguments are given, only the short usage text is
  # shown and the arguments are passed on to PrintConfigInfo; without
  # arguments the more verbose usage text is printed.
  sub usage
  {
      use Pod::Usage;

      my $config_given = @_ ? 1 : 0;

      # 'NOEXIT' keeps pod2usage from terminating the script so that the
      # module information can still be printed afterwards.
      pod2usage(
          -exitval => 'NOEXIT',
          -message => 'uplug - the startup script for the Uplug toolbox',
          -verbose => ($config_given ? 0 : 1),
      );

      &PrintConfigInfo(@_) if $config_given;
      exit 1;
  }
  # help_message([$note])
  #
  # Show the complete manual page of this script (option -H) and exit
  # with status 1. An optional first argument is written to STDERR
  # after the manual page.
  sub help_message
  {
      use Pod::Usage;

      my @pod_args = (
          -exitval => 'NOEXIT',     # we terminate the script ourselves below
          -message => 'uplug - the startup script for the Uplug toolbox',
          -verbose => 2,
      );
      pod2usage(@pod_args);

      if (@_){
          print STDERR $_[0];
      }
      exit 1;
  }
  # find_config($file)
  #
  # Implements option -e: look up the configuration file $file with
  # FindConfig and print its location on STDOUT.
  #
  # Exit status: 0 if the configuration file exists, 1 otherwise (the
  # statuses used to be the other way round, which made the script
  # report failure on success to calling shells). If no file name is
  # given, an error message and the general usage text are printed.
  sub find_config{
      my $file = shift;
      unless ($file){
          print STDERR "Please give a Uplug configuration file!\n\n";
          # explicit empty argument list: a bare '&usage;' would pass on
          # whatever is left in @_ and switch usage() to its per-config mode
          &usage();
      }
      my $config = &FindConfig($file);
      if (-e $config){
          print $config,"\n";
          exit 0;
      }
      print STDERR "Cannot find configuration file '$file'!\n";
      exit 1;
  }
  # print_config($file)
  #
  # Implements option -p: read the configuration $file with ReadConfig,
  # print it with WriteConfig and exit with status 1. If no file name
  # is given, an error message and the general usage text are printed
  # instead (same handling as in find_config).
  sub print_config{
      my $file = shift;
      unless (defined $file && length $file){
          print STDERR "Please give a Uplug configuration file!\n\n";
          &usage();                   # prints the usage text and exits
      }
      my $config = &ReadConfig($file);
      &WriteConfig(undef,$config);
      # NOTE(review): after the shift above this echoes the argument that
      # FOLLOWS the config-file to STDERR; kept for compatibility, but it
      # looks like a left-over from help_message -- confirm and remove.
      print STDERR $_[0] if @_;
      exit 1;
  }
  # list_modules(@args)
  #
  # Implements option -l: hand all arguments to ListAvailableModules,
  # write the first argument (if any) to STDERR and exit with status 1.
  sub list_modules{
      &ListAvailableModules(@_);
      if (@_){
          print STDERR $_[0];
      }
      exit 1;
  }
  129. __END__
  130. =head1 DESCRIPTION
  131. The basic use of this startup script is to load a Uplug module, to parse its configuration and to run it using the command-line arguments given. Uplug modules may consist of complex processing pipelines and loops and Uplug tries to build system calls accordingly.
  132. You can check whether a specific module exists using the flag C<-e>. This will also return the location of the config-file if it can be found:
  133. uplug -e config-file
  134. You can list all available modules (i.e. Uplug configuration files) by running
  135. uplug -l
  136. You can also list only the modules within a specific sub-directory. For example, to list all configuration files for pre-processing English you can run
  137. uplug -l pre/en
  138. =head2 Uplug modules
  139. The main modules are structured in categories like this:
  140. pre/ ........ pre-processing (generic and language-specific ones)
  141. pre/xx ...... language-specific pre-processing modules (<xx> = langID)
  142. align ....... modules for alignment of parallel texts
  143. align/word .. modules for word alignment
  144. The most common modules are the following
  145. pre/basic ... basic pre-processing (includes 'markup', 'sent', 'tok')
  146. pre/markup .. basic markup (text to XML, paragraph boundaries)
  147. pre/sent .... a generic sentence boundary detector
  148. pre/tok ..... a generic tokenizer
  149. pre/xx-all .. bundle pre-processing for language <xx>
  150. pre/xx-tag .. tag untokenized XML text in language <xx>
  151. align/sent .. length-based sentence alignment
  152. align/hun ... wrapper around hunalign
  153. align/gma ... geometric mapping and alignment
  154. align/word/basic ..... basic word alignment (based on clues)
  155. align/word/default ... default settings for word alignment
  156. align/word/advanced .. advanced settings for word alignment
  157. If you install C<uplug-treetagger>, then the following module is also quite useful:
  158. pre/xx/all-treetagger run pre-processing pipeline including TreeTagger
  159. To get more information about a specific module, run (for example for the module 'pre/basic')
  160. uplug -h pre/basic
  161. To print the configuration file on screen, use
  162. uplug -p pre/basic
  163. Sometimes it can be handy to define fallback modules in case you don't know exactly if a certain module exists. For example, you may want to use language-specific pre-processing pipelines but you like to fall back to the generic pre-processing steps when no language-specific configuration is found. Here is an example:
  164. uplug -f pre/basic pre/ar/basic -in input.txt -out output.txt
  165. This command tries to call C<pre/ar/basic> (Arabic pre-processing) but falls back to the generic C<pre/basic> if this module cannot be found. You can also give a sequence of fallback modules with the same flag. Separate each fallback module by ':'.
  166. =head2 Uplug module scripts
  167. Uplug modules usually call external scripts distributed by this package. There are a number of scripts for specific tasks. Here is a list of scripts (to be found in C<$Uplug::config::SHARED_BIN>):
  168. =over
  169. =item Pre-processing
  170. uplug-markup uplug-tok uplug-sent
  171. uplug-toktag uplug-tokext uplug-tag
  172. uplug-split uplug-chunk uplug-malt
  173. =item Sentence alignment
  174. uplug-sentalign uplug-hunalign uplug-gma
  175. =item Word alignment (and related tasks)
  176. uplug-coocfreq uplug-coocstat uplug-strsim
  177. uplug-ngramfreq uplug-ngramstat uplug-markphr
  178. uplug-giza uplug-linkclue uplug-wordalign
  179. =item Other
  180. uplug-convert
  181. =back
  182. =head1 Examples
  183. =head2 Prepare project directory
  184. Make a new project directory and go there:
  185. mkdir myproject
  186. cd myproject
  187. Copy example files into the project directory:
  188. cp /path/to/uplug/example/1988sv.txt .
  189. cp /path/to/uplug/example/1988en.txt .
  190. =head2 Basic pre-processing (text to xml)
  191. Convert texts in Swedish and English, encoded in ISO-8859-1 (latin1) and add some basic markup (paragraph boundaries, sentence boundaries and token boundaries).
  192. uplug pre/basic -ci 'iso-8859-1' -in 1988sv.txt > 1988sv.xml
  193. uplug pre/basic -ci 'iso-8859-1' -in 1988en.txt > 1988en.xml
  194. =head2 Sentence alignment
  195. Align the files from the previous step:
  196. uplug align/sent -src 1988sv.xml -trg 1988en.xml > 1988sven.xml
  197. Sentence alignment pointers are stored in C<1988sven.xml>.
  198. You can read the aligned bitext segments using the following command:
  199. uplug-readalign 1988sven.xml | less
  200. =head2 Word alignment (default mode)
  201. uplug align/word/default -in 1988sven.xml -out 1988sven.links
  202. This will take some time! Word alignment is slow even for this
  203. little bitext. The word aligner will
  204. * create basic clues (Dice and LCSR)
  205. * run GIZA++ with standard settings (trained on plain text)
  206. * learn clues from GIZA's Viterbi alignments
  207. * "radical stemming" (take only the 3 initial characters
  208. of each token) and run GIZA++ again
  209. * align words with existing clues
  210. * learn clues from previous alignment
  211. * align words again with all existing clues
  212. Word alignment results are stored in 1988sven.links.
  213. You may look at word type links using the following script:
  214. /path/to/uplug/tools/xces2dic < 1988sven.links | less
  215. =head2 Word alignment (tagged mode)
  216. Use the following command for aligning tagged corpora (at least POS tags):
  217. cp /path/to/uplug/example/svenprf* .
  218. uplug align/word/tagged -in svenprf.xces -out svenprf.links
  219. This is essentially the same as the default word alignment with additional
  220. clues for POS and chunk labels.
  221. =head2 Word alignment with Moses output format (using default mode)
  222. Use the following command if you like to get the word alignments
  223. in Moses format (links between word positions like in Moses after
  224. word alignment symmetrization)
  225. uplug align/word/default -in 1988sven.xml -out 1988sven.links -of moses
  226. The Parameter '-of' is used to set the output format. The same
  227. parameter is available for other word alignment settings like
  228. 'basic' and 'advanced'
  229. Note that you can easily convert your parallel corpus into Moses
  230. format as well. There are actually four options:
  231. uplug/tools/xces2text 1988sven.xml output.sv output.en
  232. uplug/tools/xces2moses -s sv -t en 1988sven.xml output
  233. uplug/tools/opus2moses.pl -d . -e output.sv -f output.en < 1988sven.xml
  234. uplug/tools/xces2plain 1988sven.xml output output sv en
  235. The four tools use different ways of extracting the text from the
  236. aligned XML files. Look at the code and the usage information about
  237. how they differ. The first option is probably the safest one as
  238. this uses the same Uplug modules for extracting the text as they
  239. are used for word alignment. The last one requires XML::DT and works
  240. even when sentences are not aligned monotonically.
  241. =head2 Tagging (using external taggers)
  242. There are several taggers that can be called from the Uplug
  243. scripts. The following command can be used to tag the English
  244. example corpus:
  245. uplug pre/en/tagGrok -in 1988en.xml > 1988en.tag
  246. =head2 Chunking (using external chunkers)
  247. There is a chunker for English that can be run on POS-tagged
  248. corpus files:
  249. uplug pre/en/chunk -in 1988en.tag > 1988en.chunk
  250. =head2 Word alignment evaluation
  251. Word alignment can be evaluated using a gold standard (reference
  252. links stored in another file using the same format as for the
  253. links produced by Uplug). There is a small gold standard for the
  254. example bitext used in 3f). Alignments produced above can be
  255. evaluated using the following command:
  256. uplug-evalalign -gold svenprf.gold -in svenprf.links | less
  257. Several measures will be computed by comparing reference links
  258. with links proposed by the system.
  259. =head2 Word alignment (using existing clues)
  260. 3c) and 3f) explained how to run the aligner with all its
  261. sub-processes. However, existing clues do not have to be computed
  262. each time. Existing clues can be re-used for further alignment
  263. runs. The user can specify the set of clues that should be used
  264. for aligning words. The following command runs the word aligner
  265. with one clue type (GIZA++ translation probabilities):
  266. uplug align/word/test/link -gw -in svenprf.xces -out links.new
  267. Weights can be set independently for each clue type. For example,
  268. in the example above we can specify a clue weight (e.g. 0.01) for
  269. GIZA++ clues using the following runtime parameter: '-gw_w 0.01'.
  270. Lots of different clues may be used depending on what has been
  271. computed before. The following table gives an overview of some
  272. available runtime clue-parameters.
  273. clue-flag weight-flag clue type
  274. ---------------------------------------------------------------------
  275. -sim -sim_w LCSR (string similarity)
  276. -dice -dice_w Dice coefficient
  277. -mi -mi_w point-wise Mutual Information
  278. -tscore -tscore_w t-scores
  279. -gw -gw_w GIZA++ trained on tokenised plain text
  280. -gp -gp_w GIZA++ trained on POS tags
  281. -gpw -gpw_w GIZA++ trained on words and POS tags
  282. -gwp -gwp_w GIZA++ trained on word-prefixes (3 character)
  283. -gws -gws_w GIZA++ trained on word-suffixes (3 character)
  284. -gwi -gwi_w GIZA++ inverse (same as -gw)
  285. -gpi -gpi_w GIZA++ inverse (same as -gp)
  286. -gpwi -gpwi_w GIZA++ inverse (same as -gpw)
  287. -gwpi -gwpi_w GIZA++ inverse (same as -gwp)
  288. -gwsi -gwsi_w GIZA++ inverse (same as -gws)
  289. -dl -dl_q dynamic clue (words)
  290. -dlp -dlp_w dynamic clue (words+POS)
  291. -dp3 -dp3_w dynamic clue (POS-trigram)
  292. -dcp3 -dcp3_w dynamic clue (chunklabel+POS-trigram)
  293. -dpx -dpx_w dynamic clue (POS+relative position)
  294. -dp3x -dp3x_w dynamic clue (POS trigram+relative position)
  295. -dc3 -dc3_w dynamic clue (chunk label trigram)
  296. -dc3p -dc3p_w dynamic clue (chunk label trigram+POS)
  297. -dc3x -dc3x_w dynamic clue (chunk trigram+relative position)
  298. =head2 Word alignment (basic mode)
  299. There is another standard setting for word alignment:
  300. uplug align/word/basic -in 1988sven.xml -out basic.links
  301. The word aligner will
  302. * create basic clues (Dice and LCSR)
  303. * run GIZA++ with standard settings (trained on plain text)
  304. * align words with existing clues
  305. Word alignment results are stored in basic.links.
  306. You may look at word type links using the following script:
  307. /path/to/uplug/tools/xces2dic < basic.links | less
  308. =head2 Word alignment (advanced mode)
  309. This setting is similar to the tagged word alignment settings (3i) but the
  310. last two steps will be repeated 3 times (learning clues from previous
  311. alignments). This is the slowest standard setting for word alignment.
  312. uplug align/word/advanced -in svenprf.xces -out advanced.links
  313. /path/to/uplug/tools/xces2dic < advanced.links | less
  314. =head1 See also
  315. More information on Uplug module configurations: Look at L<Uplug::Config>
  316. More downloads:
  317. L<https://bitbucket.org/tiedemann/uplug>
  318. =cut