PageRenderTime 78ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/uplug-main/lib/Uplug/Align/Word/Giza.pm

https://bitbucket.org/tiedemann/uplug
Perl | 795 lines | 598 code | 98 blank | 99 comment | 101 complexity | 34c25d8107de0246175b6d12ef0a6418 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. #-*-perl-*-
  2. package Uplug::Align::Word::Giza;
  3. use strict;
  4. use Cwd;
  5. use Exporter;
  6. use File::Copy;
  7. use Uplug::Data;
  8. use Uplug::Data::Align;
  9. use Uplug::IO::Any;
  10. use Uplug::Config;
  11. use vars qw(@ISA @EXPORT $DEBUG $GIZAHOME);
  12. # @ISA = qw( Uplug::Align::Word Exporter );
  13. @ISA = qw( Exporter);
  14. $DEBUG = 0;
  15. @EXPORT = qw( &Bitext2Text &RunGiza &Combined2Uplug &Giza2Clue &Giza2Uplug );
  16. #BEGIN{
  17. our $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++";
  18. if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";}
  19. if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";}
  20. if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";}
  21. if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";}
  22. #}
  23. # if (not -d $GIZAHOME){warn "cannot find GIZA++!";exit;}
  24. #----------------------------------------------------------------------------
  25. # Giza2Clue (new version): no external calls
  26. # - looks for $dir/GIZA++.actual.ti.final (lexical prob's from GIZA)
  27. # - creates data/runtime/giza.dbm
  28. # - creates data/runtime/giza2.dbm (inverse alignments)
  29. sub Giza2Clue{
  30. my $file=shift;
  31. my $param=shift;
  32. my $inverse=shift;
  33. my $ClueDB=shift;
  34. my $threshold=shift;
  35. if (-d $file){ # if file is a directory
  36. $file="$file/GIZA++.actual.ti.final"; # look for the standard file in
  37. } # this directory!
  38. my %dic;
  39. if (ref($ClueDB) eq 'HASH'){
  40. %dic=%{$ClueDB};
  41. }
  42. else{
  43. %dic=('format' => 'dbm',
  44. 'write_mode' => 'overwrite',
  45. 'key' => ['source','target']);
  46. my $cluedir='data/runtime';
  47. if ($inverse){$dic{file}="$cluedir/giza2.dbm";}
  48. else{$dic{file}="$cluedir/giza.dbm";}
  49. }
  50. my %inStream=('file' => $file,
  51. 'format' => 'tab',
  52. 'field delimiter' => ' ');
  53. if ($inverse){
  54. $inStream{'columns'}=['source','target','value',],
  55. }
  56. else{
  57. $inStream{'columns'}=['target','source','value',],
  58. }
  59. my %lex=();
  60. my $data=Uplug::Data->new;
  61. my $in=Uplug::IO::Any->new(\%inStream);
  62. $in->open('read',\%inStream);
  63. my $count=0;
  64. while ($in->read($data)){
  65. $count++;
  66. if (not ($count % 1000)){print STDERR '.';}
  67. if (not ($count % 10000)){print STDERR "$count\n";}
  68. my $src=$data->attribute('source');
  69. my $trg=$data->attribute('target');
  70. if ((not $src) or (not $trg)){next;}
  71. my $value=$data->attribute('value');
  72. if (not $value){$value=1;}
  73. if ((defined $threshold) and ($value<$threshold)){next;}
  74. $lex{$src}{$trg}=$value;
  75. if (($src=~s/\_/ /gs) or ($trg=~s/\_/ /gs)){ # (for giza-clue:)
  76. $lex{$src}{$trg}=$value; # '_' means ' '
  77. }
  78. }
  79. my $header=$in->header;
  80. my $out=Uplug::IO::Any->new(\%dic);
  81. $out->open('write',\%dic);
  82. $out->addheader($header);
  83. $out->addheader($param);
  84. $out->writeheader();
  85. foreach my $s (keys %lex){
  86. my $total;
  87. foreach my $t (keys %{$lex{$s}}){
  88. my $score=$lex{$s}{$t};
  89. my $data=Uplug::Data->new;
  90. $data->setAttribute('source',$s);
  91. $data->setAttribute('target',$t);
  92. $data->setAttribute('score',$score);
  93. $out->write($data);
  94. }
  95. }
  96. $out->close;
  97. $in->close;
  98. }
  99. #----------------------------------------------------------------------------
  100. # Giza2Uplug: convert GIZA's Viterbi alignment to Uplug format (XML)
  101. # (slow and risky: GIZA's output must be complete and use a certain format)
  102. sub Giza2Uplug{
  103. my $viterbi=shift;
  104. my $bitext=shift;
  105. my $param=shift;
  106. my $links=shift;
  107. my $inverse=shift;
  108. if (ref($links) ne 'HASH'){return 0;}
  109. my $input=Uplug::IO::Any->new($bitext);
  110. if (not ref($input)){return 0;}
  111. if (not $input->open('read',$bitext)){return 0;}
  112. my $BitextHeader=$input->header();
  113. my $output=Uplug::IO::Any->new($links);
  114. if (not ref($output)){return 0;}
  115. $output->addheader($BitextHeader);
  116. if (not $output->open('write',$links)){return 0;}
  117. $output->setOption('SkipDataHeader',0); # don't skip headers and
  118. $output->setOption('SkipDataTail',0); # footers (e.g. <linkGrp>)
  119. #------------------------------------------------------------------------
  120. if ($viterbi=~/\.gz$/){
  121. open F,"gzip -cd <$viterbi |" ||
  122. die "cannot open Viterbi alignment file $viterbi!";
  123. }
  124. else{
  125. open F,"<$viterbi" ||
  126. die "cannot open Viterbi alignment file $viterbi!";
  127. }
  128. #------------------------------------------------------------------------
  129. my $TokenLabel='w';
  130. my $data=Uplug::Data::Align->new();
  131. print STDERR "convert GIZA's Viterbi alignment to XML!\n";
  132. my $count=0;
  133. while ($input->read($data)){
  134. $count++;
  135. if (not ($count % 100)){
  136. $|=1;print STDERR '.';$|=0;
  137. }
  138. if (not ($count % 1000)){
  139. $|=1;print STDERR "$count\n";$|=0;
  140. }
  141. #----------------------------------
  142. # do the same as for Bitext2Text!!
  143. # (to check for empty strings ...)
  144. #
  145. my @SrcNodes=();
  146. my @TrgNodes=();
  147. my ($srctxt,$trgtxt)=
  148. &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
  149. if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
  150. #----------------------------------
  151. # my $SrcData=$data->sourceData();
  152. # my $TrgData=$data->targetData();
  153. #
  154. # my @SrcNodes=$SrcData->findNodes($TokenLabel);
  155. my @SrcIds=$data->attribute(\@SrcNodes,'id');
  156. my @SrcSpans=$data->attribute(\@SrcNodes,'span');
  157. # my @SrcTokens=$data->content(\@SrcNodes);
  158. my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes);
  159. # my @TrgNodes=$TrgData->findNodes($TokenLabel);
  160. my @TrgIds=$data->attribute(\@TrgNodes,'id');
  161. my @TrgSpans=$data->attribute(\@TrgNodes,'span');
  162. # my @TrgTokens=$data->content(\@TrgNodes);
  163. my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes);
  164. if ((not @SrcNodes) or (not @TrgNodes)){next;}
  165. $_=<F>;
  166. $_=<F>;
  167. chomp;
  168. my @src=split(/ /);
  169. $_=<F>;
  170. chomp;
  171. my %align=();
  172. my $count=1;
  173. while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!!
  174. if ($2){push (@{$align{$2}},$count);}
  175. $count++;
  176. }
  177. foreach (sort keys %align){
  178. my @s;my @t;
  179. if ($inverse){
  180. @t=@{$align{$_}};
  181. @s=split(/\s/);
  182. }
  183. else{
  184. @s=@{$align{$_}};
  185. @t=split(/\s/);
  186. }
  187. my @src=();my @trg=();
  188. foreach (@s){push (@src,$SrcTokens[$_-1]);}
  189. foreach (@t){push (@trg,$TrgTokens[$_-1]);}
  190. my @srcId=();my @trgId=();
  191. foreach (@s){push (@srcId,$SrcIds[$_-1]);}
  192. foreach (@t){push (@trgId,$TrgIds[$_-1]);}
  193. my @srcSpan=();my @trgSpan=();
  194. foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
  195. foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}
  196. my %link=();
  197. $link{link}=join ' ',@src;
  198. $link{link}.=';';
  199. $link{link}.=join ' ',@trg;
  200. $link{source}=join '+',@srcId;
  201. $link{target}=join '+',@trgId;
  202. $link{src}=join '&',@srcSpan;
  203. $link{trg}=join '&',@trgSpan;
  204. $data->addWordLink(\%link);
  205. }
  206. $output->write($data);
  207. }
  208. $input->close;
  209. $output->close;
  210. }
  211. #----------------------------------------------------------------------------
  212. # Combined2Uplug: combine GIZA's Viterbi alignment and convert them to Uplug format (XML)
  213. # (slow and risky: GIZA's output must be complete and must use a certain format)
  214. #
  215. # possible combinatins: union, intersection, refined
  216. #
  217. sub Combined2Uplug{
  218. my $giza0=shift;
  219. my $giza1=shift;
  220. my $combine=shift;
  221. my $bitext=shift;
  222. my $param=shift;
  223. my $links=shift;
  224. if (ref($links) ne 'HASH'){return 0;}
  225. my $input=Uplug::IO::Any->new($bitext);
  226. if (not ref($input)){return 0;}
  227. if (not $input->open('read',$bitext)){return 0;}
  228. my $output; # open output later (after reading the first record of
  229. # the input stream --> this gives the complete input header)
  230. #------------------------------------------------------------------------
  231. if ($giza0=~/\.gz$/){open F0,"gzip -cd <$giza0 |";}
  232. else{open F0,"<$giza0";}
  233. # ($]>=5.008){binmode(F0, ":encoding(utf-8)");}
  234. if ($giza1=~/\.gz$/){open F1,"gzip -cd <$giza1 |";}
  235. else{open F1,"<$giza1";}
  236. # ($]>=5.008){binmode(F1, ":encoding(utf-8)");}
  237. #------------------------------------------------------------------------
  238. my $TokenLabel='w';
  239. my $data=Uplug::Data::Align->new();
  240. print STDERR "combine GIZA's Viterbi alignments and convert to XML!\n";
  241. my $count=0;
  242. while ($input->read($data)){
  243. if (not $output){ # output is not opened yet:
  244. my $BitextHeader=$input->header(); # - get the input header
  245. $output=Uplug::IO::Any->new($links); # - create an output stream
  246. if (not ref($output)){return 0;} # (or die)
  247. $output->addheader($BitextHeader); # - add input header
  248. if (not $output->open('write',$links)){ # - open the output stream
  249. return 0; # (or die)
  250. }
  251. $output->setOption('SkipDataHeader',0); # don't skip headers and
  252. $output->setOption('SkipDataTail',0); # footers (e.g. <linkGrp>)
  253. }
  254. $count++;
  255. if (not ($count % 100)){
  256. $|=1;print STDERR '.';$|=0;
  257. }
  258. if (not ($count % 1000)){
  259. $|=1;print STDERR "$count\n";$|=0;
  260. }
  261. #----------------------------------
  262. # do the same as for Bitext2Text!!
  263. # (to check for empty strings ...)
  264. #
  265. my @SrcNodes=();
  266. my @TrgNodes=();
  267. my ($srctxt,$trgtxt)=
  268. &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
  269. if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
  270. #----------------------------------
  271. # my @SrcNodes=$SrcData->findNodes($TokenLabel);
  272. my @SrcIds=$data->attribute(\@SrcNodes,'id');
  273. my @SrcSpans=$data->attribute(\@SrcNodes,'span');
  274. # my @SrcTokens=$data->content(\@SrcNodes);
  275. my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes);
  276. # my @TrgNodes=$TrgData->findNodes($TokenLabel);
  277. my @TrgIds=$data->attribute(\@TrgNodes,'id');
  278. my @TrgSpans=$data->attribute(\@TrgNodes,'span');
  279. # my @TrgTokens=$data->content(\@TrgNodes);
  280. my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes);
  281. if ((not @SrcNodes) or (not @TrgNodes)){next;}
  282. $_=<F1>;$_=<F1>;chomp; # read source->target viterbi alignment
  283. my @src=split(/ /);
  284. $_=<F1>;chomp;
  285. my %srclinks=();
  286. my $count=1;
  287. while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!!
  288. my @s=split(/\s/,$2);
  289. foreach (@s){$srclinks{$_}{$count}=1;}
  290. $count++;
  291. }
  292. $_=<F0>;$_=<F0>;chomp; # read source->target viterbi alignment
  293. my @trg=split(/ /);
  294. $_=<F0>;chomp;
  295. my %trglinks=();
  296. my $count=1;
  297. while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!!
  298. my @t=split(/\s/,$2);
  299. foreach (@t){$trglinks{$_}{$count}=1;}
  300. $count++;
  301. }
  302. my (%CombinedSrc,%CombinedTrg);
  303. &CombineLinks(\%srclinks,\%trglinks,$combine,\%CombinedSrc,\%CombinedTrg);
  304. my @cluster=&LinkClusters(\%CombinedSrc,\%CombinedTrg);
  305. foreach my $c (@cluster){
  306. # my @s=sort {$a <=> $b} keys %{$cluster[$_]{src}};
  307. # my @t=sort {$a <=> $b} keys %{$cluster[$_]{trg}};
  308. my @s=@{$$c{src}};
  309. my @t=@{$$c{trg}};
  310. my @src=();my @trg=();
  311. foreach (@s){push (@src,$SrcTokens[$_-1]);}
  312. foreach (@t){push (@trg,$TrgTokens[$_-1]);}
  313. my @srcId=();my @trgId=();
  314. foreach (@s){push (@srcId,$SrcIds[$_-1]);}
  315. foreach (@t){push (@trgId,$TrgIds[$_-1]);}
  316. my @srcSpan=();my @trgSpan=();
  317. foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
  318. foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}
  319. my %link=();
  320. $link{link}=join ' ',@src;
  321. $link{link}.=';';
  322. $link{link}.=join ' ',@trg;
  323. $link{source}=join '+',@srcId;
  324. $link{target}=join '+',@trgId;
  325. $link{src}=join '&',@srcSpan;
  326. $link{trg}=join '&',@trgSpan;
  327. $data->addWordLink(\%link);
  328. }
  329. $output->write($data);
  330. }
  331. $input->close;
  332. $output->close;
  333. }
  334. sub LinkClusters{
  335. my ($src,$trg)=@_;
  336. my @cluster=();
  337. while (keys %{$src}){
  338. my ($s,$links)=each %{$src}; # get the next source token
  339. if ((ref($$src{$s}) ne 'HASH') or
  340. (not keys %{$$src{$s}})){ # if no links exist:
  341. delete $$src{$s}; # delete and next!
  342. next;
  343. }
  344. push (@cluster,{src=>[],trg=>[]}); # create a new link cluster
  345. push (@{$cluster[-1]{src}},$s); # and save it in the cluster
  346. &AddLinks($cluster[-1],$src,$trg,$s, # add all tokens aligned to the
  347. 'src','trg'); # source token to the cluster
  348. } # (and recursively the ones
  349. foreach my $c (@cluster){
  350. @{$$c{src}}=sort {$a <=> $b} @{$$c{src}};
  351. @{$$c{trg}}=sort {$a <=> $b} @{$$c{trg}};
  352. }
  353. return @cluster;
  354. } # linked to them, see AddLinks)
  355. sub AddLinks{
  356. my ($cluster,$src,$trg,$s,$key1,$key2)=@_;
  357. foreach my $t (keys %{$$src{$s}}){ # add all linked tokens to the
  358. delete $$src{$s}{$t}; # cluster and delete the links
  359. delete $$trg{$t}{$s}; # in the link-hashs
  360. push (@{$$cluster{$key2}},$t);
  361. &AddLinks($cluster,$trg,$src,$t,$key2,$key1); # add tokens aligned to the
  362. } # linked token to the cluster
  363. delete $$src{$s}; # delete the source token link hash
  364. }
  365. sub CombineLinks{
  366. my ($src,$trg,$method,$srclinks,$trglinks)=@_;
  367. # my %srclinks;
  368. # my %trglinks;
  369. if ($method eq 'union'){
  370. foreach my $s (keys %{$src}){
  371. foreach my $t (keys %{$$src{$s}}){
  372. $$srclinks{$s}{$t}=1;
  373. $$trglinks{$t}{$s}=1;
  374. }
  375. }
  376. foreach my $t (keys %{$trg}){
  377. foreach my $s (keys %{$$trg{$t}}){
  378. $$srclinks{$s}{$t}=1;
  379. $$trglinks{$t}{$s}=1;
  380. }
  381. }
  382. }
  383. elsif ($method eq 'intersection'){
  384. foreach my $s (keys %{$src}){
  385. foreach my $t (keys %{$$src{$s}}){
  386. if (exists $$trg{$t}{$s}){
  387. $$srclinks{$s}{$t}=1;
  388. $$trglinks{$t}{$s}=1;
  389. }
  390. }
  391. }
  392. }
  393. if ($method eq 'refined'){ # refined combination:
  394. my %links=();
  395. foreach my $s (keys %{$src}){ # 1) start with intersection
  396. foreach my $t (keys %{$$src{$s}}){
  397. if (exists $$trg{$t}{$s}){
  398. $$srclinks{$s}{$t}=1;
  399. $$trglinks{$t}{$s}=1;
  400. }
  401. else{
  402. $links{"$s:$t"}=1; # keep union of links
  403. }
  404. }
  405. }
  406. foreach my $t (keys %{$trg}){
  407. foreach my $s (keys %{$$trg{$t}}){
  408. if (not exists $$src{$s}{$t}){
  409. $links{"$s:$t"}=1;
  410. }
  411. }
  412. }
  413. add_unaligned(\%links,$srclinks,$trglinks); # 2) add unaligned pairs
  414. add_adjacent(\%links,$srclinks,$trglinks); # 3) add adjacent links
  415. }
  416. # $src=\%srclinks;
  417. # $trg=\%trglinks;
  418. }
  419. sub is_diagonal{
  420. my ($s,$t,$srclinks,$trglinks)=@_;
  421. if (defined $$srclinks{$s-1}){
  422. return 1 if (defined $$trglinks{$t-1});
  423. return 1 if (defined $$trglinks{$t+1});
  424. }
  425. if (defined $$srclinks{$s+1}){
  426. return 1 if (defined $$trglinks{$t-1});
  427. return 1 if (defined $$trglinks{$t+1});
  428. }
  429. return 0;
  430. }
  431. sub is_adjacent{
  432. my ($s,$t,$srclinks,$trglinks)=@_;
  433. if (exists $$srclinks{$s}){
  434. return 1 if (exists $$srclinks{$s}{$t-1});
  435. return 1 if (exists $$srclinks{$s}{$t+1});
  436. }
  437. elsif (exists $$trglinks{$t}){
  438. return 1 if (exists $$trglinks{$t}{$s-1});
  439. return 1 if (exists $$trglinks{$t}{$s+1});
  440. }
  441. return 0;
  442. }
  443. sub add_adjacent{
  444. my $links=shift;
  445. my $srclinks=shift;
  446. my $trglinks=shift;
  447. my $add=1;
  448. while (%{$links} && $add){
  449. $add=0;
  450. foreach my $l (keys %{$links}){
  451. my ($l) = each %{$links};
  452. my ($s,$t) = split(/:/,$l);
  453. next if (exists $$srclinks{$s} && exists $$trglinks{$t});
  454. if (is_adjacent($s,$t,$srclinks,$trglinks)){
  455. $$srclinks{$s}{$t}=1;
  456. $$trglinks{$t}{$s}=1;
  457. delete $$links{$l};
  458. $add++;
  459. }
  460. }
  461. }
  462. }
  463. sub is_unaligned{
  464. my ($s,$t,$srclinks,$trglinks)=@_;
  465. if (not exists $$srclinks{$s}){
  466. return 1 if (not exists $$trglinks{$t});
  467. }
  468. return 0;
  469. }
  470. sub add_unaligned{
  471. my $links=shift;
  472. my $srclinks=shift;
  473. my $trglinks=shift;
  474. foreach my $l (keys %{$links}){
  475. my ($s,$t) = split(/:/,$l);
  476. if (is_unaligned($s,$t,$srclinks,$trglinks)){
  477. $$srclinks{$s}{$t}=1;
  478. $$trglinks{$t}{$s}=1;
  479. delete $$links{$l};
  480. }
  481. }
  482. }
  483. sub add_adjacent_old{
  484. my $links=shift;
  485. my $srclinks=shift;
  486. my $trglinks=shift;
  487. foreach my $s (0..$#{$links}){
  488. foreach my $t (0..$#{$$links[$s]}){
  489. next if (not $$links[$s][$t]);
  490. if ((not defined $$srclinks{$s}) and
  491. (not defined $$trglinks{$t})){ # - if both are not aligned yet:
  492. $$srclinks{$s}{$t}=1; # add the link
  493. $$trglinks{$t}{$s}=1;
  494. }
  495. elsif ((defined $$srclinks{$s-1}) or
  496. (defined $$srclinks{$s+1})){
  497. if (($$srclinks{$s-1}{$t}) or # if the link is adjacent to
  498. ($$srclinks{$s+1}{$t})){ # another one horizontally:
  499. if ($$srclinks{$s}{$t+1}){next;} # do not accept if it is also
  500. if ($$srclinks{$s}{$t-1}){next;} # adjacent to other links vertically
  501. if ($$srclinks{$s-1}{$t}){ # do not accept if the adjacent
  502. if ($$srclinks{$s-1}{$t-1}){next;} # link is also adjacent to other
  503. if ($$srclinks{$s-1}{$t+1}){next;} # links vertically
  504. }
  505. if ($$srclinks{$s+1}{$t}){ # the same for the other
  506. if ($$srclinks{$s+1}{$t-1}){next;} # adjacency direction
  507. if ($$srclinks{$s+1}{$t+1}){next;}
  508. }
  509. $$srclinks{$s}{$t}=1; # everything ok: add the link
  510. $$trglinks{$t}{$s}=1;
  511. }
  512. }
  513. elsif ((defined $$trglinks{$t-1}) or
  514. (defined $$trglinks{$t+1})){
  515. if (($$srclinks{$s}{$t-1}) or # if the link is adjacent to
  516. ($$srclinks{$s}{$t+1})){ # another one vertically:
  517. if ($$srclinks{$s+1}{$t}){next;} # do not accept if it is also
  518. if ($$srclinks{$s-1}{$t}){next;} # adjacent to other links horizontally
  519. if ($$srclinks{$s}{$t-1}){ # do not accept if the adjacent
  520. if ($$srclinks{$s-1}{$t-1}){next;} # link is also adjacent to other
  521. if ($$srclinks{$s+1}{$t-1}){next;} # links horizontally
  522. }
  523. if ($$srclinks{$s}{$t+1}){ # the same for the other
  524. if ($$srclinks{$s-1}{$t+1}){next;} # adjacency direction
  525. if ($$srclinks{$s+1}{$t+1}){next;}
  526. }
  527. $$srclinks{$s}{$t}=1; # everything ok: add the link
  528. $$trglinks{$t}{$s}=1;
  529. }
  530. }
  531. }
  532. }
  533. }
  534. #----------------------------------------------------------------------------
  535. # RunGiza: run GIZA++ using external scripts
  536. # (GIZA must be installed in the given directory)
  537. sub RunGiza{
  538. my $src=shift;
  539. my $trg=shift;
  540. my $viterbi=shift;
  541. if (not -d $GIZAHOME){
  542. $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++";
  543. if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";}
  544. if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";}
  545. if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";}
  546. if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";}
  547. if (not -d $GIZAHOME){
  548. warn "cannot find GIZA++ in $GIZAHOME! Set ENV{UPLUGHOME} or Uplug::Align::Word::Giza::GIZAHOME!";
  549. return 0;
  550. }
  551. }
  552. if (my $sig=system "$GIZAHOME/plain2snt.out $src $trg"){
  553. die "got signal $? from plain2snt!\n";
  554. }
  555. my $command="PATH=\$\{PATH\}:$GIZAHOME;";
  556. my $snt="$src$trg\.snt";
  557. if (not -e $snt){$snt="$src\_$trg\.snt";}
  558. if (not -e $snt){die "cannot find alignment-file: $snt!\n";}
  559. $command.="$GIZAHOME/trainGIZA++.sh $src\.vcb $trg\.vcb $snt";
  560. if (my $sig=system $command){
  561. die "got signal $? from trainGIZA++.sh!\n";
  562. }
  563. if ($viterbi){copy ('GIZA++.A3.final',$viterbi);}
  564. else{copy ('GIZA++.A3.final','viterbi');}
  565. }
  566. #----------------------------------------------------------------------------
  567. # Bitext2Text: convert bitexts from Uplug format (XML) to GIZA's format
  568. # (this is much too slow ....)
  569. sub Bitext2Text{
  570. my $bitext=shift;
  571. my $srcfile=shift;
  572. my $trgfile=shift;
  573. my $param=shift;
  574. my %SrcStream=('format'=>'text','file'=>$srcfile);
  575. my %TrgStream=('format'=>'text','file'=>$trgfile);
  576. my $input=Uplug::IO::Any->new($bitext);
  577. my $source=Uplug::IO::Any->new(\%SrcStream);
  578. my $target=Uplug::IO::Any->new(\%TrgStream);
  579. $input->open('read',$bitext)
  580. || warn "cannot open the bitext" && return 0;
  581. $source->open('write',\%SrcStream)
  582. || warn "cannot write to $srcfile" && return 0;
  583. $target->open('write',\%TrgStream)
  584. || warn "cannot write to $trgfile" && return 0;
  585. #-------------------------------------------------------------------------
  586. my $data=Uplug::Data::Align->new();
  587. print STDERR "convert bitext to plain text!\n";
  588. my $count=0;
  589. while ($input->read($data)){
  590. $count++;
  591. if (not ($count % 100)){
  592. $|=1;print STDERR '.';$|=0;
  593. }
  594. if (not ($count % 1000)){
  595. $|=1;print STDERR "$count\n";$|=0;
  596. }
  597. my ($srctxt,$trgtxt)=&BitextStrings($data,$param);
  598. if (($srctxt=~/\S/) and ($trgtxt=~/\S/)){
  599. $source->write($srctxt);
  600. $target->write($trgtxt);
  601. }
  602. }
  603. # $BitextHeader=$input->header;
  604. $input->close;
  605. $source->close;
  606. $target->close;
  607. return $input->header;
  608. }
  609. #----------------------------------------------------------------------------
  610. # get the actual strings from the bitext (using feature-parameters)
  611. # (feature specifications as in coocfreq.pl)
  612. sub BitextStrings{
  613. my $data=shift;
  614. my $param=shift;
  615. my ($srcnodes,$trgnodes)=@_;
  616. my @srctok=$data->getSrcTokenFeatures($param,$srcnodes);
  617. my @trgtok=$data->getTrgTokenFeatures($param,$trgnodes);
  618. map($_=~s/^\s+//sg,@srctok); # delete initial white-space
  619. map($_=~s/^\s+//sg,@trgtok);
  620. map($_=~s/(\S)\s+$/$1/sg,@srctok); # delete final white-space
  621. map($_=~s/(\S)\s+$/$1/sg,@trgtok);
  622. map($_=~s/\n/ /sg,@srctok); # otherwise: convert to space
  623. map($_=~s/\n/ /sg,@trgtok);
  624. map($_=~s/\s/\_/sg,@srctok); # and replace space with underline
  625. map($_=~s/\s/\_/sg,@trgtok); # (to avoid extra tokens)
  626. my $srctxt=join(' ',@srctok);
  627. my $trgtxt=join(' ',@trgtok);
  628. $srctxt=~tr/\n/ /;
  629. $trgtxt=~tr/\n/ /;
  630. return ($srctxt,$trgtxt);
  631. }
  632. #----------------------------------------------------------------------------
  633. =pod
  634. =head1 Synopsis
  635. use lib '/path/to/uplug';
  636. use Uplug::Align::Word::Giza;
  637. $ENV{UPLUGHOME}='/path/to/uplug';
  638. my %bitext = ('file' => 'svenprf.xces',
  639. 'format' => 'xces align');
  640. &Bitext2Text(\%bitext,'src','trg',{}); # convert to plain text
  641. &RunGiza('src','trg','viterbi.src-trg'); # run GIZA++ (src-->trg)
  642. &RunGiza('trg','src','viterbi.trg-src'); # run GIZA++ (trg-->src)
  643. my %dbm = (file=>'clues.dbm',
  644. format=>'dbm',
  645. key => ['source','target']);
  646. &Giza2Clue('.', # directory where GIZA was running (=current)
  647. {}, # parameter (= clue dbm header)
  648. 1, # =1 --> inverse (trg-src)
  649. \%dbm); # clue DBM (giza.dbm if not specified)
  650. my $combine = 'intersection' # combine heuristics (union|refined)
  651. my %out = ('file' => $combined, # save the result in this file
  652. 'format' => 'xces align');
  653. &Combined2Uplug('viterbi.src-trg', # name of first viterbi alignment
  654. 'viterbi.trg-src', # name of second viterbi alignment
  655. $combine, # type of combination heuristics
  656. \%bitext, # bitext
  657. {}, # token parameters
  658. \%out); # output stream
  659. =head1 GIZA++ directories
  660. You have to tell the program where it can find the GIZA++ executables.
  661. You can either set the UPLUGHOME environment variable or the GIZAHOME variable.
  662. $ENV{UPLUGHOME}='/path/to/uplug';
  663. $Uplug::Align::Word::Giza::GIZAHOME="/path/to/uplug/ext/GIZA++";
  664. =cut
  665. 1;