PageRenderTime 310ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/uplug-main/bin/uplug-giza

https://bitbucket.org/tiedemann/uplug
Perl | 743 lines | 620 code | 59 blank | 64 comment | 35 complexity | 997e09445f1a1b942958c4bc0ddfd3ba MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. #!/usr/bin/env perl
  2. #-*-perl-*-
  3. #
  4. # giza.pl: wrapper for Giza++
  5. #
  6. #---------------------------------------------------------------------------
  7. # Copyright (C) 2004 Jörg Tiedemann <joerg@stp.ling.uu.se>
  8. #
  9. # This program is free software; you can redistribute it and/or modify
  10. # it under the terms of the GNU General Public License as published by
  11. # the Free Software Foundation; either version 2 of the License, or
  12. # (at your option) any later version.
  13. #
  14. # This program is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program; if not, write to the Free Software
  21. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  22. #---------------------------------------------------------------------------
  23. # $Id$
  24. #----------------------------------------------------------------------------
  25. #
  26. #
  27. use strict;
  28. use Cwd;
  29. use FindBin qw($Bin);
  30. use File::Copy;
  31. use lib "$Bin/../lib";
  32. use strict;
  33. use Uplug::Data;
  34. use Uplug::Data::Align;
  35. use Uplug::IO::Any;
  36. use Uplug::Config;
  my $UplugHome="$Bin/../";
  $ENV{UPLUGHOME}=$UplugHome;
  my $PWD=getcwd;

  # GIZA++ is needed by everything below: stop right away, with a failure
  # status, if it cannot be found.
  my $GIZA = &find_executable('GIZA++');
  if (not -e $GIZA){
      warn "cannot find GIZA++!\n";
      exit 1;
  }

  # CheckParameter is imported (not defined in this file); it receives the
  # defaults, the command line and the name of the ini file.
  # NOTE(review): presumably it merges them into %IniData -- confirm.
  my %IniData=GetDefaultIni();
  my $IniFile='giza.ini';
  &CheckParameter(\%IniData,\@ARGV,$IniFile);

  my $direction=$IniData{parameter}{'alignment direction'};
  my $makeclue=$IniData{parameter}{'make clue'};
  my $TokenParam=$IniData{parameter}{token};
  my $combined=$IniData{parameter}{'symmetric alignment'};
  if ($combined){$direction='both';}      # symmetrisation needs both runs

  #---------------------------------------------------------------------------
  my ($InputStreamName,$InputStream)=
      each %{$IniData{'input'}};          # the first input stream
  # the clue DBs are taken out of the output section first, so that they are
  # never picked as "the first output stream" below
  my $ClueDB=$IniData{output}{clue};
  delete $IniData{output}{clue};
  my $ClueDBinv=$IniData{output}{clue_inv};
  delete $IniData{output}{clue_inv};
  my ($OutputStreamName,$OutputStream)=   # take only
      each %{$IniData{'output'}};         # the first output stream

  #---------------------------------------------------------------------------
  # my $TmpDir=Uplug::IO::Any::GetTempFileName;
  my $TmpDir='/tmp/giza'.$$;
  if (not mkdir $TmpDir,0755){
      my $error="cannot create temporary directory $TmpDir: $!\n";
      undef $TmpDir;       # keep the END block away from a directory
      die $error;          # that this process did not create
  }
  my $SrcFile=$TmpDir."/src";
  my $TrgFile=$TmpDir."/trg";
  my $BitextHeader;        # set by Bitext2Text, used by the converters

  Bitext2Text($InputStream,$SrcFile,$TrgFile,$TokenParam);

  # Run GIZA++ in the requested direction(s), trg-src first.
  # Each entry: direction name, GIZA source file, GIZA target file and the
  # "inverse" flag expected by Giza2Uplug/Giza2Clue.
  foreach my $run (['trg-src','trg','src',1],
                   ['src-trg','src','trg',0]){
      my ($name,$from,$to,$inverse)=@{$run};
      next unless (($direction eq $name) or ($direction eq 'both'));

      chdir $TmpDir or die "cannot change to $TmpDir: $!\n";
      RunGiza($TmpDir,$from,$to);
      if ($combined){
          # keep the Viterbi alignment: the next run overwrites it
          copy('GIZA++.A3.final',"$name.viterbi")
              or die "cannot copy GIZA++.A3.final to $name.viterbi: $!\n";
      }
      chdir $PWD or die "cannot change back to $PWD: $!\n";

      if ((ref($OutputStream) eq 'HASH') and (not $combined)){
          Giza2Uplug($TmpDir,$InputStream,$TokenParam,$OutputStream,$inverse);
      }
      if ($makeclue){
          Giza2Clue($TmpDir,$TokenParam,$inverse);
      }
  }

  if ($combined){
      Combined2Uplug($TmpDir.'/src-trg.viterbi',
                     $TmpDir.'/trg-src.viterbi',$combined,
                     $InputStream,$TokenParam,$OutputStream);
  }
  # Remove the temporary working directory when the program terminates.
  END{
      if ($TmpDir and (-d $TmpDir)){
          # Inside END, $? is the exit status the program is about to return.
          # Running external commands here would overwrite it (a failed run
          # could end up exiting with 0), so the cleanup uses built-ins only
          # and $? is protected in addition.
          local $?;
          unlink glob("$TmpDir/*");    # plain files only, like 'rm -f'
          rmdir $TmpDir;               # succeeds only if the directory is empty
      }
  }
  # ReadGizaVoc: read a GIZA++ vocabulary file into a hash
  #
  # $file : name of the vocabulary file; each line holds three
  #         whitespace-separated fields of which the first two (id and
  #         word) are used
  # $voc  : reference to the hash that receives the mapping id => word
  #
  # dies if the file cannot be opened
  sub ReadGizaVoc{
      my ($file,$voc)=@_;
      # 'or' (not '||') so that the check applies to open() and not to the
      # file name; the :utf8 layer replaces the former binmode call
      open(my $fh,'<:utf8',$file)
          or die "Cannot open vocabulary file $file: $!\n";
      while (my $line=<$fh>){
          my ($id,$word,$freq)=split(/\s+/,$line);
          next unless (defined($id) and length($id));    # skip empty lines
          $$voc{$id} = $word;
      }
      close $fh;
  }
  #----------------------------------------------------------------------------
  # Giza2Clue (new version): no external calls
  # - reads $dir/GIZA++.t3.final (lexical prob's from GIZA) and the
  #   vocabulary files $dir/src.vcb and $dir/trg.vcb
  # - writes to the stream configured as output 'clue' ('clue_inv' if
  #   $inverse is set); without such a configuration it
  #   - creates data/runtime/giza.dbm
  #   - creates data/runtime/giza2.dbm (inverse alignments)
  #
  # $dir     : directory that contains GIZA's output
  # $param   : token parameters (added to the header of the clue DB)
  # $inverse : true if the table comes from the run in direction trg-src
  sub Giza2Clue{
      my ($dir,$param,$inverse)=@_;

      # pick the output stream: configured clue DB or the default dbm file
      my %dic;
      if ($inverse and (ref($ClueDBinv) eq 'HASH')){
          %dic=%{$ClueDBinv};
      }
      elsif ((not $inverse) and (ref($ClueDB) eq 'HASH')){
          %dic=%{$ClueDB};
      }
      else{
          %dic=('format' => 'dbm',
                'write_mode' => 'overwrite',
                'key' => ['source','target']);
          my $cluedir='data/runtime';
          if ($inverse){$dic{file}="$cluedir/giza2.dbm";}
          else{$dic{file}="$cluedir/giza.dbm";}
      }

      my (%SrcVoc,%TrgVoc);
      ReadGizaVoc("$dir/src.vcb",\%SrcVoc);
      ReadGizaVoc("$dir/trg.vcb",\%TrgVoc);

      # the table holds word ids; for the inverse run the first two columns
      # are swapped
      my %inStream=('file' => "$dir/GIZA++.t3.final",
                    'format' => 'tab',
                    'field delimiter' => ' ');
      if ($inverse){
          $inStream{'columns'}=['target','source','value'];
      }
      else{
          $inStream{'columns'}=['source','target','value'];
      }

      my %lex=();
      my $data=Uplug::Data->new;
      my $in=Uplug::IO::Any->new(\%inStream);
      $in->open('read',\%inStream);
      while ($in->read($data)){
          my $src=$SrcVoc{$data->attribute('source')};
          my $trg=$TrgVoc{$data->attribute('target')};
          # skip ids without a vocabulary entry; test for emptiness and not
          # for truth, otherwise the token '0' would be thrown away
          next unless (defined($src) and length($src));
          next unless (defined($trg) and length($trg));
          my $value=$data->attribute('value');
          if (not $value){$value=1;}
          $lex{$src}{$trg}=$value;
          # (for giza-clue:) '_' means ' '
          # both substitutions have to run: combining them with 'or' in one
          # condition would skip the target whenever the source matched
          my $src_changed=($src=~s/\_/ /gs);
          my $trg_changed=($trg=~s/\_/ /gs);
          if ($src_changed or $trg_changed){
              $lex{$src}{$trg}=$value;
          }
      }

      my $header=$in->header;
      my $out=Uplug::IO::Any->new(\%dic);
      $out->open('write',\%dic);
      $out->addheader($header);
      $out->addheader($param);
      $out->writeheader();
      foreach my $s (keys %lex){
          foreach my $t (keys %{$lex{$s}}){
              my $entry=Uplug::Data->new;
              $entry->setAttribute('source',$s);
              $entry->setAttribute('target',$t);
              $entry->setAttribute('score',$lex{$s}{$t});
              $out->write($entry);
          }
      }
      $out->close;
      $in->close;
  }
  #----------------------------------------------------------------------------
  # Giza2Uplug: convert GIZA's Viterbi alignment to Uplug format (XML)
  # (slow and risky: GIZA's output must be complete and use a certain format)
  #
  # $dir     : directory that contains GIZA++.A3.final
  # $bitext  : stream specification of the aligned input bitext
  # $param   : token parameters (must be the ones used for Bitext2Text)
  # $links   : stream specification (hash reference) of the output
  # $inverse : true if GIZA was run in direction trg-src
  #
  # returns 0 if one of the streams or the GIZA file cannot be opened
  sub Giza2Uplug{
      my ($dir,$bitext,$param,$links,$inverse)=@_;
      if (ref($links) ne 'HASH'){return 0;}

      # open GIZA's output first: if it is missing nothing is written at all
      my $giza=$dir.'/GIZA++.A3.final';
      my $viterbi;
      if (not open($viterbi,'<',$giza)){
          warn "cannot open GIZA's Viterbi alignment $giza: $!\n";
          return 0;
      }

      my $input=Uplug::IO::Any->new($bitext);
      if (not ref($input)){return 0;}
      if (not $input->open('read',$bitext)){return 0;}
      my $output=Uplug::IO::Any->new($links);
      if (not ref($output)){return 0;}
      $output->addheader($BitextHeader);
      if (not $output->open('write',$links)){return 0;}

      # select the elements of a list by 1-based token positions
      my $pick=sub{
          my ($list,$positions)=@_;
          return @{$list}[map {$_-1} @{$positions}];
      };

      #------------------------------------------------------------------------
      my $data=Uplug::Data::Align->new();
      print STDERR "convert GIZA's Viterbi alignment to XML!\n";
      my $count=0;
      my $truncated=0;
      while ($input->read($data)){
          $count++;
          if (not ($count % 100)){
              $|=1;print STDERR '.';$|=0;
          }
          if (not ($count % 1000)){
              $|=1;print STDERR "$count\n";$|=0;
          }
          #----------------------------------
          # do the same as for Bitext2Text!!
          # (to check for empty strings ...)
          # sentence pairs skipped there have no record in GIZA's output
          #
          my @SrcNodes=();
          my @TrgNodes=();
          my ($srctxt,$trgtxt)=
              BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
          if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
          #----------------------------------
          my @SrcIds=$data->attribute(\@SrcNodes,'id');
          my @SrcSpans=$data->attribute(\@SrcNodes,'span');
          my @SrcTokens=$data->content(\@SrcNodes);
          my @TrgIds=$data->attribute(\@TrgNodes,'id');
          my @TrgSpans=$data->attribute(\@TrgNodes,'span');
          my @TrgTokens=$data->content(\@TrgNodes);
          if ((not @SrcNodes) or (not @TrgNodes)){next;}

          # one record per sentence pair: three lines of which only the
          # last one (tokens followed by '({ positions })') is used
          my $info=<$viterbi>;
          my $sentence=<$viterbi>;
          my $aligned=<$viterbi>;
          if (not defined $aligned){
              if (not $truncated){
                  warn "GIZA's Viterbi alignment $giza ends prematurely!\n";
              }
              $truncated=1;
              $aligned='';
          }
          chomp $aligned;

          # %align: string of linked positions => positions of the tokens
          # on this line that carry exactly this set of links
          my %align=();
          my $position=1;
          while ($aligned=~/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # ignore NULL!!
              if ($2){push (@{$align{$2}},$position);}
              $position++;
          }

          foreach my $linked (sort keys %align){
              my @s;my @t;
              if ($inverse){               # the line lists target tokens
                  @t=@{$align{$linked}};
                  @s=split(/\s/,$linked);
              }
              else{                        # the line lists source tokens
                  @s=@{$align{$linked}};
                  @t=split(/\s/,$linked);
              }
              my %link=();
              $link{link}=join ' ',$pick->(\@SrcTokens,\@s);
              $link{link}.=';';
              $link{link}.=join ' ',$pick->(\@TrgTokens,\@t);
              $link{source}=join '+',$pick->(\@SrcIds,\@s);
              $link{target}=join '+',$pick->(\@TrgIds,\@t);
              $link{src}=join '&',$pick->(\@SrcSpans,\@s);
              $link{trg}=join '&',$pick->(\@TrgSpans,\@t);
              $data->addWordLink(\%link);
          }
          $output->write($data);
      }
      close $viterbi;
      $input->close;
      return $output->close;
  }
  #----------------------------------------------------------------------------
  # Combined2Uplug: combine GIZA's Viterbi alignments and convert them to
  # Uplug format (XML)
  # (slow and risky: GIZA's output must be complete and must use a certain
  # format)
  #
  # possible combinations: union, intersection, refined
  #
  # $giza0   : Viterbi alignment file, called with the src-trg run
  # $giza1   : Viterbi alignment file, called with the trg-src run
  # $combine : combination method (see CombineLinks)
  # $bitext  : stream specification of the aligned input bitext
  # $param   : token parameters (must be the ones used for Bitext2Text)
  # $links   : stream specification (hash reference) of the output
  #
  # returns 0 if one of the streams or alignment files cannot be opened
  sub Combined2Uplug{
      my ($giza0,$giza1,$combine,$bitext,$param,$links)=@_;
      if (ref($links) ne 'HASH'){return 0;}

      # open both alignment files first: if one is missing nothing is written
      my ($viterbi0,$viterbi1);
      if (not open($viterbi0,'<',$giza0)){
          warn "cannot open GIZA's Viterbi alignment $giza0: $!\n";
          return 0;
      }
      if (not open($viterbi1,'<',$giza1)){
          warn "cannot open GIZA's Viterbi alignment $giza1: $!\n";
          return 0;
      }

      my $input=Uplug::IO::Any->new($bitext);
      if (not ref($input)){return 0;}
      if (not $input->open('read',$bitext)){return 0;}
      my $output=Uplug::IO::Any->new($links);
      if (not ref($output)){return 0;}
      $output->addheader($BitextHeader);
      if (not $output->open('write',$links)){return 0;}

      # read the next record (three lines, only the last one is used) from an
      # alignment file and return {linked position}{token position on the line}
      my $truncated=0;
      my $read_links=sub{
          my ($fh)=@_;
          my %found=();
          my $info=<$fh>;
          my $sentence=<$fh>;
          my $aligned=<$fh>;
          if (not defined $aligned){
              if (not $truncated){
                  warn "GIZA's Viterbi alignment ends prematurely!\n";
              }
              $truncated=1;
              return \%found;
          }
          chomp $aligned;
          my $position=1;
          while ($aligned=~/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # ignore NULL!!
              foreach my $linked (split(/\s/,$2)){
                  $found{$linked}{$position}=1;
              }
              $position++;
          }
          return \%found;
      };

      # select the elements of a list by 1-based token positions
      my $pick=sub{
          my ($list,$positions)=@_;
          return @{$list}[map {$_-1} @{$positions}];
      };

      #------------------------------------------------------------------------
      my $data=Uplug::Data::Align->new();
      print STDERR "combine GIZA's Viterbi alignments and convert to XML!\n";
      my $count=0;
      while ($input->read($data)){
          $count++;
          if (not ($count % 100)){
              $|=1;print STDERR '.';$|=0;
          }
          if (not ($count % 1000)){
              $|=1;print STDERR "$count\n";$|=0;
          }
          #----------------------------------
          # do the same as for Bitext2Text!!
          # (to check for empty strings ...)
          # sentence pairs skipped there have no record in GIZA's output
          #
          my @SrcNodes=();
          my @TrgNodes=();
          my ($srctxt,$trgtxt)=
              BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
          if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
          #----------------------------------
          my @SrcIds=$data->attribute(\@SrcNodes,'id');
          my @SrcSpans=$data->attribute(\@SrcNodes,'span');
          my @SrcTokens=$data->content(\@SrcNodes);
          my @TrgIds=$data->attribute(\@TrgNodes,'id');
          my @TrgSpans=$data->attribute(\@TrgNodes,'span');
          my @TrgTokens=$data->content(\@TrgNodes);
          if ((not @SrcNodes) or (not @TrgNodes)){next;}

          # target->source alignment: {source position}{target position}
          my $srclinks=$read_links->($viterbi1);
          # source->target alignment: {target position}{source position}
          my $trglinks=$read_links->($viterbi0);

          my (%CombinedSrc,%CombinedTrg);
          CombineLinks($srclinks,$trglinks,$combine,
                       \%CombinedSrc,\%CombinedTrg);
          my @cluster=LinkClusters(\%CombinedSrc,\%CombinedTrg);

          # one word link per cluster of connected tokens
          foreach my $c (@cluster){
              my @s=@{$$c{src}};
              my @t=@{$$c{trg}};
              my %link=();
              $link{link}=join ' ',$pick->(\@SrcTokens,\@s);
              $link{link}.=';';
              $link{link}.=join ' ',$pick->(\@TrgTokens,\@t);
              $link{source}=join '+',$pick->(\@SrcIds,\@s);
              $link{target}=join '+',$pick->(\@TrgIds,\@t);
              $link{src}=join '&',$pick->(\@SrcSpans,\@s);
              $link{trg}=join '&',$pick->(\@TrgSpans,\@t);
              $data->addWordLink(\%link);
          }
          $output->write($data);
      }
      close $viterbi0;
      close $viterbi1;
      $input->close;
      return $output->close;
  }
  # LinkClusters: group links into clusters of connected tokens
  #
  # $src : reference to a hash {source token}{target token} of links
  # $trg : reference to the same links in the form {target token}{source token}
  #
  # Both hashes are consumed: every link that has been put into a cluster is
  # deleted from them.
  #
  # returns a list of clusters, each of the form {src=>[...],trg=>[...]} with
  # numerically sorted tokens; clusters are ordered by their smallest source
  # token
  sub LinkClusters{
      my ($src,$trg)=@_;
      my @cluster=();
      # sorted keys: the order of the clusters must not depend on hash order
      foreach my $s (sort {$a <=> $b} keys %{$src}){
          next unless (exists $$src{$s});      # absorbed by an earlier cluster
          if ((ref($$src{$s}) ne 'HASH') or
              (not keys %{$$src{$s}})){        # if no links exist:
              delete $$src{$s};                # delete and next!
              next;
          }
          push (@cluster,{src=>[$s],trg=>[]}); # create a new link cluster
          AddLinks($cluster[-1],$src,$trg,$s,  # add all tokens aligned to the
                   'src','trg');               # source token to the cluster
      }                                        # (and recursively the ones
                                               # linked to them, see AddLinks)
      foreach my $c (@cluster){
          @{$$c{src}}=sort {$a <=> $b} @{$$c{src}};
          @{$$c{trg}}=sort {$a <=> $b} @{$$c{trg}};
      }
      return @cluster;
  }

  # AddLinks: add everything that is reachable from token $s to a cluster
  #
  # $cluster : the cluster, a hash reference with list references stored
  #            under $key1 and $key2
  # $src     : links {token on the side of $s}{token on the other side}
  # $trg     : the same links in the other direction
  # $s       : the token to start from (must be in the cluster already)
  # $key1    : cluster key for the side of $s
  # $key2    : cluster key for the other side
  #
  # The function calls itself with swapped roles for every linked token.
  # All links that have been followed are deleted from both hashes.
  sub AddLinks{
      my ($cluster,$src,$trg,$s,$key1,$key2)=@_;
      my $links=$$src{$s};
      if (ref($links) eq 'HASH'){
          foreach my $t (sort {$a <=> $b} keys %{$links}){
              # In a cyclic link graph a nested call may already have
              # followed this link: do not follow it twice.
              next unless (exists $$links{$t});
              delete $$links{$t};              # delete the link in both
              if (ref($$trg{$t}) eq 'HASH'){   # link-hashes
                  delete $$trg{$t}{$s};
              }
              # a token can be reached on several paths but must be
              # listed only once
              if (not grep {$_ eq $t} @{$$cluster{$key2}}){
                  push (@{$$cluster{$key2}},$t);
              }
              AddLinks($cluster,$trg,$src,$t,$key2,$key1);
          }
      }
      delete $$src{$s};                        # delete the token's link hash
  }
  # CombineLinks: combine the links of two directional alignments
  #
  # $src      : links of the first alignment,  {source token}{target token}
  # $trg      : links of the second alignment, {target token}{source token}
  # $method   : 'union', 'intersection' or 'refined' (anything else: no links)
  # $srclinks : hash reference, receives the result as {source}{target}
  # $trglinks : hash reference, receives the result as {target}{source}
  #
  # Tokens are expected to be numeric positions ('refined' looks at the
  # neighbours $s-1, $s+1, $t-1 and $t+1). The input hashes are not modified.
  sub CombineLinks{
      my ($src,$trg,$method,$srclinks,$trglinks)=@_;

      # Test for a link in the result WITHOUT creating hash entries. A plain
      # look-up $$srclinks{$s}{$t} would autovivify $$srclinks{$s}, and the
      # 'defined' tests below would then take unaligned tokens for aligned.
      my $linked=sub{
          my ($s,$t)=@_;
          return 0 unless (ref($$srclinks{$s}) eq 'HASH');
          return $$srclinks{$s}{$t} ? 1 : 0;
      };
      my $add=sub{
          my ($s,$t)=@_;
          $$srclinks{$s}{$t}=1;
          $$trglinks{$t}{$s}=1;
      };

      if ($method eq 'union'){
          foreach my $s (keys %{$src}){
              foreach my $t (keys %{$$src{$s}}){
                  $add->($s,$t);
              }
          }
          foreach my $t (keys %{$trg}){
              foreach my $s (keys %{$$trg{$t}}){
                  $add->($s,$t);
              }
          }
      }
      elsif (($method eq 'intersection') or ($method eq 'refined')){
          foreach my $s (keys %{$src}){
              foreach my $t (keys %{$$src{$s}}){
                  if ((ref($$trg{$t}) eq 'HASH') and $$trg{$t}{$s}){
                      $add->($s,$t);
                  }
              }
          }
      }

      # refined combination:
      # * start with the intersection
      # * go iteratively through other links
      # The result depends on the order of this iteration; the keys are
      # sorted to make it reproducible.
      # NOTE(review): only the links in $src are candidates here, links that
      # exist only in $trg are never added -- confirm that this is intended.
      if ($method eq 'refined'){
          foreach my $s (sort {$a <=> $b} keys %{$src}){
              foreach my $t (sort {$a <=> $b} keys %{$$src{$s}}){
                  if ((not defined $$srclinks{$s}) and
                      (not defined $$trglinks{$t})){
                      $add->($s,$t);     # both are not aligned yet: add it
                  }
                  elsif ((defined $$srclinks{$s-1}) or
                         (defined $$srclinks{$s+1})){
                      # the link must be adjacent to another one horizontally
                      next unless ($linked->($s-1,$t) or $linked->($s+1,$t));
                      # do not accept if it is also adjacent to other links
                      # vertically
                      next if ($linked->($s,$t+1));
                      next if ($linked->($s,$t-1));
                      # do not accept if the adjacent link is also adjacent
                      # to other links vertically
                      if ($linked->($s-1,$t)){
                          next if ($linked->($s-1,$t-1));
                          next if ($linked->($s-1,$t+1));
                      }
                      if ($linked->($s+1,$t)){
                          next if ($linked->($s+1,$t-1));
                          next if ($linked->($s+1,$t+1));
                      }
                      $add->($s,$t);     # everything ok: add the link
                  }
                  elsif ((defined $$trglinks{$t-1}) or
                         (defined $$trglinks{$t+1})){
                      # the link must be adjacent to another one vertically
                      next unless ($linked->($s,$t-1) or $linked->($s,$t+1));
                      # do not accept if it is also adjacent to other links
                      # horizontally
                      next if ($linked->($s+1,$t));
                      next if ($linked->($s-1,$t));
                      # do not accept if the adjacent link is also adjacent
                      # to other links horizontally
                      if ($linked->($s,$t-1)){
                          next if ($linked->($s-1,$t-1));
                          next if ($linked->($s+1,$t-1));
                      }
                      if ($linked->($s,$t+1)){
                          next if ($linked->($s-1,$t+1));
                          next if ($linked->($s+1,$t+1));
                      }
                      $add->($s,$t);     # everything ok: add the link
                  }
              }
          }
      }
  }
  #----------------------------------------------------------------------------
  # RunGiza: run GIZA++ and its helper programs in the current directory
  # (plain2snt.out, snt2cooc.out, mkcls and GIZA++ must be installed where
  # find_executable can locate them)
  #
  # $dir : working directory (not used here, the caller has changed to it)
  # $src : name of the plain text file used as GIZA's source language
  # $trg : name of the plain text file used as GIZA's target language
  #
  # dies as soon as one of the programs fails
  sub RunGiza{
      my ($dir,$src,$trg)=@_;
      my $plain2snt = &find_executable('plain2snt.out');
      my $snt2cooc = &find_executable('snt2cooc.out');
      my $mkcls = &find_executable('mkcls');
      my $GIZA = &find_executable('GIZA++');
      my $snt="$src\_$trg\.snt";

      # The commands are run by /bin/sh, so output and error messages are
      # redirected in POSIX syntax ('> file 2>&1'); the csh/bash form
      # '>& file' is rejected by shells like dash.
      my @steps=(
          ['plain2snt',
           "$plain2snt $src $trg"],
          ['snt2cooc',
           "$snt2cooc $src.vcb $trg.vcb $snt > src_trg.cooc"],
          ['mkcls (src)',
           "$mkcls -m2 -p$src -c50 -V$src.vcb.classes opt > mkcls1.log 2>&1"],
          ['mkcls (trg)',
           "$mkcls -m2 -p$trg -c50 -V$trg.vcb.classes opt > mkcls2.log 2>&1"],
          ['GIZA++',
           "$GIZA -S $src.vcb -T $trg.vcb -C $snt -p0 0.98 ".
           "-cooc src_trg.cooc -o GIZA++ > GIZA.log 2>&1"],
      );
      foreach my $step (@steps){
          my ($name,$command)=@{$step};
          if (system($command)){
              die "got signal $? from $name!\n";
          }
      }
  }
  #----------------------------------------------------------------------------
  # Bitext2Text: convert bitexts from Uplug format (XML) to GIZA's format
  # (this is much too slow ....)
  #
  # $bitext  : stream specification of the aligned input bitext
  # $srcfile : name of the plain text file for the source language
  # $trgfile : name of the plain text file for the target language
  # $param   : token parameters (see BitextStrings)
  #
  # Writes one line per sentence pair to each file; pairs in which one side
  # contains no non-space character are skipped. Stores the header of the
  # bitext in $BitextHeader.
  #
  # returns 1, or 0 if the bitext cannot be opened (as Giza2Uplug does)
  sub Bitext2Text{
      my ($bitext,$srcfile,$trgfile,$param)=@_;
      my %SrcStream=('format'=>'text','file'=>$srcfile);
      my %TrgStream=('format'=>'text','file'=>$trgfile);

      my $input=Uplug::IO::Any->new($bitext);
      if (not ref($input)){
          warn "cannot create the input stream for the bitext!\n";
          return 0;
      }
      if (not $input->open('read',$bitext)){
          warn "cannot open the bitext!\n";
          return 0;
      }
      my $source=Uplug::IO::Any->new(\%SrcStream);
      my $target=Uplug::IO::Any->new(\%TrgStream);
      $source->open('write',\%SrcStream);
      $target->open('write',\%TrgStream);
      #-------------------------------------------------------------------------
      my $data=Uplug::Data::Align->new();
      print STDERR "convert bitext to plain text!\n";
      my $count=0;
      while ($input->read($data)){
          $count++;
          if (not ($count % 100)){
              $|=1;print STDERR '.';$|=0;
          }
          if (not ($count % 1000)){
              $|=1;print STDERR "$count\n";$|=0;
          }
          my ($srctxt,$trgtxt)=BitextStrings($data,$param);
          # GIZA needs text on both sides; the converters apply the same
          # test to stay in sync with GIZA's output
          if (($srctxt=~/\S/) and ($trgtxt=~/\S/)){
              $source->write($srctxt);
              $target->write($trgtxt);
          }
      }
      $BitextHeader=$input->header;
      $input->close;
      $source->close;
      $target->close;
      return 1;
  }
  #----------------------------------------------------------------------------
  # BitextStrings: build the source and the target string of a sentence pair
  # from its token features (feature specifications as in coocfreq.pl)
  #
  # $data   : the sentence pair (has to provide getSrcTokenFeatures and
  #           getTrgTokenFeatures)
  # $param  : token parameters, handed on to these methods
  # further arguments (optional): two references, handed on as second
  #           argument to getSrcTokenFeatures and getTrgTokenFeatures
  #
  # returns (source string, target string) with tokens separated by one space
  sub BitextStrings{
      my ($data,$param,$srcnodes,$trgnodes)=@_;
      my @srctok=$data->getSrcTokenFeatures($param,$srcnodes);
      my @trgtok=$data->getTrgTokenFeatures($param,$trgnodes);

      # the loop variable is an alias: the tokens are changed in place
      foreach my $token (@srctok,@trgtok){
          $token=~s/^\s+//sg;            # delete initial white-space
          $token=~s/(\S)\s+$/$1/sg;      # delete final white-space
          $token=~s/\n/ /sg;             # otherwise: convert to space
          $token=~s/\s/\_/sg;            # and replace space with underline
      }                                  # (to avoid extra tokens)

      my $srctxt=join(' ',@srctok);
      $srctxt=~tr/\n/ /;
      my $trgtxt=join(' ',@trgtok);
      $trgtxt=~tr/\n/ /;
      return ($srctxt,$trgtxt);
  }
  #----------------------------------------------------------------------------
  # GetDefaultIni: the default configuration of this module
  #
  # returns a hash (as a list) with the sections 'module', 'input', 'output',
  # 'parameter' and 'arguments'
  sub GetDefaultIni{
      my %module=(
          'name' => 'run giza',
          'program' => 'giza.pl',
          'location' => '$UplugBin',
          'stdout' => 'bitext',
      );
      my %input=(
          'bitext' => {'format' => 'xces align'},
      );
      my %output=(
          'bitext' => {'format' => 'xces align',
                       'write_mode' => 'overwrite'},
      );
      my %parameter=(
          'alignment direction' => 'src-trg',  # alt.: 'trg-src' or 'both'
      );
      # short names of command line arguments and the settings they stand for
      my %shortcuts=(
          'in' => 'input:bitext:file',
          'out' => 'output:bitext:file',
          'd' => 'parameter:alignment direction',
          'c' => 'parameter:symmetric alignment',
      );
      my %ini=(
          'module' => \%module,
          'input' => \%input,
          'output' => \%output,
          'parameter' => \%parameter,
          'arguments' => {'shortcuts' => \%shortcuts},
      );
      return %ini;
  }