/uplug-main/lib/Uplug/Align/Word/Giza.pm
Perl | 795 lines | 598 code | 98 blank | 99 comment | 101 complexity | 34c25d8107de0246175b6d12ef0a6418 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
- #-*-perl-*-
- package Uplug::Align::Word::Giza;
- use strict;
- use Cwd;
- use Exporter;
- use File::Copy;
- use Uplug::Data;
- use Uplug::Data::Align;
- use Uplug::IO::Any;
- use Uplug::Config;
- use vars qw(@ISA @EXPORT $DEBUG $GIZAHOME);
- # @ISA = qw( Uplug::Align::Word Exporter );
- @ISA = qw( Exporter);
- $DEBUG = 0;
- @EXPORT = qw( &Bitext2Text &RunGiza &Combined2Uplug &Giza2Clue &Giza2Uplug );
- #BEGIN{
- our $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++";
- if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";}
- if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";}
- if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";}
- if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";}
- #}
- # if (not -d $GIZAHOME){warn "cannot find GIZA++!";exit;}
- #----------------------------------------------------------------------------
- # Giza2Clue (new version): no external calls
- # - looks for $dir/GIZA++.actual.ti.final (lexical prob's from GIZA)
- # - creates data/runtime/giza.dbm
- # - creates data/runtime/giza2.dbm (inverse alignments)
- sub Giza2Clue{
- my $file=shift;
- my $param=shift;
- my $inverse=shift;
- my $ClueDB=shift;
- my $threshold=shift;
- if (-d $file){ # if file is a directory
- $file="$file/GIZA++.actual.ti.final"; # look for the standard file in
- } # this directory!
- my %dic;
- if (ref($ClueDB) eq 'HASH'){
- %dic=%{$ClueDB};
- }
- else{
- %dic=('format' => 'dbm',
- 'write_mode' => 'overwrite',
- 'key' => ['source','target']);
- my $cluedir='data/runtime';
- if ($inverse){$dic{file}="$cluedir/giza2.dbm";}
- else{$dic{file}="$cluedir/giza.dbm";}
- }
- my %inStream=('file' => $file,
- 'format' => 'tab',
- 'field delimiter' => ' ');
- if ($inverse){
- $inStream{'columns'}=['source','target','value',],
- }
- else{
- $inStream{'columns'}=['target','source','value',],
- }
- my %lex=();
- my $data=Uplug::Data->new;
- my $in=Uplug::IO::Any->new(\%inStream);
- $in->open('read',\%inStream);
- my $count=0;
- while ($in->read($data)){
- $count++;
- if (not ($count % 1000)){print STDERR '.';}
- if (not ($count % 10000)){print STDERR "$count\n";}
- my $src=$data->attribute('source');
- my $trg=$data->attribute('target');
- if ((not $src) or (not $trg)){next;}
- my $value=$data->attribute('value');
- if (not $value){$value=1;}
- if ((defined $threshold) and ($value<$threshold)){next;}
- $lex{$src}{$trg}=$value;
- if (($src=~s/\_/ /gs) or ($trg=~s/\_/ /gs)){ # (for giza-clue:)
- $lex{$src}{$trg}=$value; # '_' means ' '
- }
- }
- my $header=$in->header;
- my $out=Uplug::IO::Any->new(\%dic);
- $out->open('write',\%dic);
- $out->addheader($header);
- $out->addheader($param);
- $out->writeheader();
- foreach my $s (keys %lex){
- my $total;
- foreach my $t (keys %{$lex{$s}}){
- my $score=$lex{$s}{$t};
- my $data=Uplug::Data->new;
- $data->setAttribute('source',$s);
- $data->setAttribute('target',$t);
- $data->setAttribute('score',$score);
- $out->write($data);
- }
- }
- $out->close;
- $in->close;
- }
- #----------------------------------------------------------------------------
- # Giza2Uplug: convert GIZA's Viterbi alignment to Uplug format (XML)
- # (slow and risky: GIZA's output must be complete and use a certain format)
- sub Giza2Uplug{
- my $viterbi=shift;
- my $bitext=shift;
- my $param=shift;
- my $links=shift;
- my $inverse=shift;
- if (ref($links) ne 'HASH'){return 0;}
- my $input=Uplug::IO::Any->new($bitext);
- if (not ref($input)){return 0;}
- if (not $input->open('read',$bitext)){return 0;}
- my $BitextHeader=$input->header();
- my $output=Uplug::IO::Any->new($links);
- if (not ref($output)){return 0;}
- $output->addheader($BitextHeader);
- if (not $output->open('write',$links)){return 0;}
- $output->setOption('SkipDataHeader',0); # don't skip headers and
- $output->setOption('SkipDataTail',0); # footers (e.g. <linkGrp>)
- #------------------------------------------------------------------------
- if ($viterbi=~/\.gz$/){
- open F,"gzip -cd <$viterbi |" ||
- die "cannot open Viterbi alignment file $viterbi!";
- }
- else{
- open F,"<$viterbi" ||
- die "cannot open Viterbi alignment file $viterbi!";
- }
- #------------------------------------------------------------------------
- my $TokenLabel='w';
- my $data=Uplug::Data::Align->new();
- print STDERR "convert GIZA's Viterbi alignment to XML!\n";
- my $count=0;
- while ($input->read($data)){
- $count++;
- if (not ($count % 100)){
- $|=1;print STDERR '.';$|=0;
- }
- if (not ($count % 1000)){
- $|=1;print STDERR "$count\n";$|=0;
- }
- #----------------------------------
- # do the same as for Bitext2Text!!
- # (to check for empty strings ...)
- #
- my @SrcNodes=();
- my @TrgNodes=();
- my ($srctxt,$trgtxt)=
- &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
- if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
- #----------------------------------
- # my $SrcData=$data->sourceData();
- # my $TrgData=$data->targetData();
- #
- # my @SrcNodes=$SrcData->findNodes($TokenLabel);
- my @SrcIds=$data->attribute(\@SrcNodes,'id');
- my @SrcSpans=$data->attribute(\@SrcNodes,'span');
- # my @SrcTokens=$data->content(\@SrcNodes);
- my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes);
- # my @TrgNodes=$TrgData->findNodes($TokenLabel);
- my @TrgIds=$data->attribute(\@TrgNodes,'id');
- my @TrgSpans=$data->attribute(\@TrgNodes,'span');
- # my @TrgTokens=$data->content(\@TrgNodes);
- my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes);
- if ((not @SrcNodes) or (not @TrgNodes)){next;}
- $_=<F>;
- $_=<F>;
- chomp;
- my @src=split(/ /);
- $_=<F>;
- chomp;
- my %align=();
- my $count=1;
- while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!!
- if ($2){push (@{$align{$2}},$count);}
- $count++;
- }
- foreach (sort keys %align){
- my @s;my @t;
- if ($inverse){
- @t=@{$align{$_}};
- @s=split(/\s/);
- }
- else{
- @s=@{$align{$_}};
- @t=split(/\s/);
- }
- my @src=();my @trg=();
- foreach (@s){push (@src,$SrcTokens[$_-1]);}
- foreach (@t){push (@trg,$TrgTokens[$_-1]);}
- my @srcId=();my @trgId=();
- foreach (@s){push (@srcId,$SrcIds[$_-1]);}
- foreach (@t){push (@trgId,$TrgIds[$_-1]);}
- my @srcSpan=();my @trgSpan=();
- foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
- foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}
- my %link=();
- $link{link}=join ' ',@src;
- $link{link}.=';';
- $link{link}.=join ' ',@trg;
- $link{source}=join '+',@srcId;
- $link{target}=join '+',@trgId;
- $link{src}=join '&',@srcSpan;
- $link{trg}=join '&',@trgSpan;
- $data->addWordLink(\%link);
- }
- $output->write($data);
- }
- $input->close;
- $output->close;
- }
- #----------------------------------------------------------------------------
- # Combined2Uplug: combine GIZA's Viterbi alignment and convert them to Uplug format (XML)
- # (slow and risky: GIZA's output must be complete and must use a certain format)
- #
- # possible combinatins: union, intersection, refined
- #
- sub Combined2Uplug{
- my $giza0=shift;
- my $giza1=shift;
- my $combine=shift;
- my $bitext=shift;
- my $param=shift;
- my $links=shift;
- if (ref($links) ne 'HASH'){return 0;}
- my $input=Uplug::IO::Any->new($bitext);
- if (not ref($input)){return 0;}
- if (not $input->open('read',$bitext)){return 0;}
- my $output; # open output later (after reading the first record of
- # the input stream --> this gives the complete input header)
- #------------------------------------------------------------------------
- if ($giza0=~/\.gz$/){open F0,"gzip -cd <$giza0 |";}
- else{open F0,"<$giza0";}
- # ($]>=5.008){binmode(F0, ":encoding(utf-8)");}
- if ($giza1=~/\.gz$/){open F1,"gzip -cd <$giza1 |";}
- else{open F1,"<$giza1";}
- # ($]>=5.008){binmode(F1, ":encoding(utf-8)");}
- #------------------------------------------------------------------------
- my $TokenLabel='w';
- my $data=Uplug::Data::Align->new();
- print STDERR "combine GIZA's Viterbi alignments and convert to XML!\n";
- my $count=0;
- while ($input->read($data)){
- if (not $output){ # output is not opened yet:
- my $BitextHeader=$input->header(); # - get the input header
- $output=Uplug::IO::Any->new($links); # - create an output stream
- if (not ref($output)){return 0;} # (or die)
- $output->addheader($BitextHeader); # - add input header
- if (not $output->open('write',$links)){ # - open the output stream
- return 0; # (or die)
- }
- $output->setOption('SkipDataHeader',0); # don't skip headers and
- $output->setOption('SkipDataTail',0); # footers (e.g. <linkGrp>)
- }
- $count++;
- if (not ($count % 100)){
- $|=1;print STDERR '.';$|=0;
- }
- if (not ($count % 1000)){
- $|=1;print STDERR "$count\n";$|=0;
- }
- #----------------------------------
- # do the same as for Bitext2Text!!
- # (to check for empty strings ...)
- #
- my @SrcNodes=();
- my @TrgNodes=();
- my ($srctxt,$trgtxt)=
- &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
- if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
- #----------------------------------
- # my @SrcNodes=$SrcData->findNodes($TokenLabel);
- my @SrcIds=$data->attribute(\@SrcNodes,'id');
- my @SrcSpans=$data->attribute(\@SrcNodes,'span');
- # my @SrcTokens=$data->content(\@SrcNodes);
- my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes);
- # my @TrgNodes=$TrgData->findNodes($TokenLabel);
- my @TrgIds=$data->attribute(\@TrgNodes,'id');
- my @TrgSpans=$data->attribute(\@TrgNodes,'span');
- # my @TrgTokens=$data->content(\@TrgNodes);
- my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes);
- if ((not @SrcNodes) or (not @TrgNodes)){next;}
- $_=<F1>;$_=<F1>;chomp; # read source->target viterbi alignment
- my @src=split(/ /);
- $_=<F1>;chomp;
- my %srclinks=();
- my $count=1;
- while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!!
- my @s=split(/\s/,$2);
- foreach (@s){$srclinks{$_}{$count}=1;}
- $count++;
- }
- $_=<F0>;$_=<F0>;chomp; # read source->target viterbi alignment
- my @trg=split(/ /);
- $_=<F0>;chomp;
- my %trglinks=();
- my $count=1;
- while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!!
- my @t=split(/\s/,$2);
- foreach (@t){$trglinks{$_}{$count}=1;}
- $count++;
- }
- my (%CombinedSrc,%CombinedTrg);
- &CombineLinks(\%srclinks,\%trglinks,$combine,\%CombinedSrc,\%CombinedTrg);
- my @cluster=&LinkClusters(\%CombinedSrc,\%CombinedTrg);
- foreach my $c (@cluster){
- # my @s=sort {$a <=> $b} keys %{$cluster[$_]{src}};
- # my @t=sort {$a <=> $b} keys %{$cluster[$_]{trg}};
- my @s=@{$$c{src}};
- my @t=@{$$c{trg}};
- my @src=();my @trg=();
- foreach (@s){push (@src,$SrcTokens[$_-1]);}
- foreach (@t){push (@trg,$TrgTokens[$_-1]);}
- my @srcId=();my @trgId=();
- foreach (@s){push (@srcId,$SrcIds[$_-1]);}
- foreach (@t){push (@trgId,$TrgIds[$_-1]);}
- my @srcSpan=();my @trgSpan=();
- foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
- foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}
- my %link=();
- $link{link}=join ' ',@src;
- $link{link}.=';';
- $link{link}.=join ' ',@trg;
- $link{source}=join '+',@srcId;
- $link{target}=join '+',@trgId;
- $link{src}=join '&',@srcSpan;
- $link{trg}=join '&',@trgSpan;
- $data->addWordLink(\%link);
- }
- $output->write($data);
- }
- $input->close;
- $output->close;
- }
- sub LinkClusters{
- my ($src,$trg)=@_;
- my @cluster=();
- while (keys %{$src}){
- my ($s,$links)=each %{$src}; # get the next source token
- if ((ref($$src{$s}) ne 'HASH') or
- (not keys %{$$src{$s}})){ # if no links exist:
- delete $$src{$s}; # delete and next!
- next;
- }
- push (@cluster,{src=>[],trg=>[]}); # create a new link cluster
- push (@{$cluster[-1]{src}},$s); # and save it in the cluster
- &AddLinks($cluster[-1],$src,$trg,$s, # add all tokens aligned to the
- 'src','trg'); # source token to the cluster
- } # (and recursively the ones
- foreach my $c (@cluster){
- @{$$c{src}}=sort {$a <=> $b} @{$$c{src}};
- @{$$c{trg}}=sort {$a <=> $b} @{$$c{trg}};
- }
- return @cluster;
- } # linked to them, see AddLinks)
- sub AddLinks{
- my ($cluster,$src,$trg,$s,$key1,$key2)=@_;
- foreach my $t (keys %{$$src{$s}}){ # add all linked tokens to the
- delete $$src{$s}{$t}; # cluster and delete the links
- delete $$trg{$t}{$s}; # in the link-hashs
- push (@{$$cluster{$key2}},$t);
- &AddLinks($cluster,$trg,$src,$t,$key2,$key1); # add tokens aligned to the
- } # linked token to the cluster
- delete $$src{$s}; # delete the source token link hash
- }
- sub CombineLinks{
- my ($src,$trg,$method,$srclinks,$trglinks)=@_;
- # my %srclinks;
- # my %trglinks;
- if ($method eq 'union'){
- foreach my $s (keys %{$src}){
- foreach my $t (keys %{$$src{$s}}){
- $$srclinks{$s}{$t}=1;
- $$trglinks{$t}{$s}=1;
- }
- }
- foreach my $t (keys %{$trg}){
- foreach my $s (keys %{$$trg{$t}}){
- $$srclinks{$s}{$t}=1;
- $$trglinks{$t}{$s}=1;
- }
- }
- }
- elsif ($method eq 'intersection'){
- foreach my $s (keys %{$src}){
- foreach my $t (keys %{$$src{$s}}){
- if (exists $$trg{$t}{$s}){
- $$srclinks{$s}{$t}=1;
- $$trglinks{$t}{$s}=1;
- }
- }
- }
- }
- if ($method eq 'refined'){ # refined combination:
- my %links=();
- foreach my $s (keys %{$src}){ # 1) start with intersection
- foreach my $t (keys %{$$src{$s}}){
- if (exists $$trg{$t}{$s}){
- $$srclinks{$s}{$t}=1;
- $$trglinks{$t}{$s}=1;
- }
- else{
- $links{"$s:$t"}=1; # keep union of links
- }
- }
- }
- foreach my $t (keys %{$trg}){
- foreach my $s (keys %{$$trg{$t}}){
- if (not exists $$src{$s}{$t}){
- $links{"$s:$t"}=1;
- }
- }
- }
- add_unaligned(\%links,$srclinks,$trglinks); # 2) add unaligned pairs
- add_adjacent(\%links,$srclinks,$trglinks); # 3) add adjacent links
- }
- # $src=\%srclinks;
- # $trg=\%trglinks;
- }
- sub is_diagonal{
- my ($s,$t,$srclinks,$trglinks)=@_;
- if (defined $$srclinks{$s-1}){
- return 1 if (defined $$trglinks{$t-1});
- return 1 if (defined $$trglinks{$t+1});
- }
- if (defined $$srclinks{$s+1}){
- return 1 if (defined $$trglinks{$t-1});
- return 1 if (defined $$trglinks{$t+1});
- }
- return 0;
- }
- sub is_adjacent{
- my ($s,$t,$srclinks,$trglinks)=@_;
- if (exists $$srclinks{$s}){
- return 1 if (exists $$srclinks{$s}{$t-1});
- return 1 if (exists $$srclinks{$s}{$t+1});
- }
- elsif (exists $$trglinks{$t}){
- return 1 if (exists $$trglinks{$t}{$s-1});
- return 1 if (exists $$trglinks{$t}{$s+1});
- }
- return 0;
- }
- sub add_adjacent{
- my $links=shift;
- my $srclinks=shift;
- my $trglinks=shift;
- my $add=1;
- while (%{$links} && $add){
- $add=0;
- foreach my $l (keys %{$links}){
- my ($l) = each %{$links};
- my ($s,$t) = split(/:/,$l);
- next if (exists $$srclinks{$s} && exists $$trglinks{$t});
- if (is_adjacent($s,$t,$srclinks,$trglinks)){
- $$srclinks{$s}{$t}=1;
- $$trglinks{$t}{$s}=1;
- delete $$links{$l};
- $add++;
- }
- }
- }
- }
- sub is_unaligned{
- my ($s,$t,$srclinks,$trglinks)=@_;
- if (not exists $$srclinks{$s}){
- return 1 if (not exists $$trglinks{$t});
- }
- return 0;
- }
- sub add_unaligned{
- my $links=shift;
- my $srclinks=shift;
- my $trglinks=shift;
- foreach my $l (keys %{$links}){
- my ($s,$t) = split(/:/,$l);
- if (is_unaligned($s,$t,$srclinks,$trglinks)){
- $$srclinks{$s}{$t}=1;
- $$trglinks{$t}{$s}=1;
- delete $$links{$l};
- }
- }
- }
- sub add_adjacent_old{
- my $links=shift;
- my $srclinks=shift;
- my $trglinks=shift;
- foreach my $s (0..$#{$links}){
- foreach my $t (0..$#{$$links[$s]}){
- next if (not $$links[$s][$t]);
- if ((not defined $$srclinks{$s}) and
- (not defined $$trglinks{$t})){ # - if both are not aligned yet:
- $$srclinks{$s}{$t}=1; # add the link
- $$trglinks{$t}{$s}=1;
- }
- elsif ((defined $$srclinks{$s-1}) or
- (defined $$srclinks{$s+1})){
- if (($$srclinks{$s-1}{$t}) or # if the link is adjacent to
- ($$srclinks{$s+1}{$t})){ # another one horizontally:
- if ($$srclinks{$s}{$t+1}){next;} # do not accept if it is also
- if ($$srclinks{$s}{$t-1}){next;} # adjacent to other links vertically
- if ($$srclinks{$s-1}{$t}){ # do not accept if the adjacent
- if ($$srclinks{$s-1}{$t-1}){next;} # link is also adjacent to other
- if ($$srclinks{$s-1}{$t+1}){next;} # links vertically
- }
- if ($$srclinks{$s+1}{$t}){ # the same for the other
- if ($$srclinks{$s+1}{$t-1}){next;} # adjacency direction
- if ($$srclinks{$s+1}{$t+1}){next;}
- }
- $$srclinks{$s}{$t}=1; # everything ok: add the link
- $$trglinks{$t}{$s}=1;
- }
- }
- elsif ((defined $$trglinks{$t-1}) or
- (defined $$trglinks{$t+1})){
- if (($$srclinks{$s}{$t-1}) or # if the link is adjacent to
- ($$srclinks{$s}{$t+1})){ # another one vertically:
- if ($$srclinks{$s+1}{$t}){next;} # do not accept if it is also
- if ($$srclinks{$s-1}{$t}){next;} # adjacent to other links horizontally
- if ($$srclinks{$s}{$t-1}){ # do not accept if the adjacent
- if ($$srclinks{$s-1}{$t-1}){next;} # link is also adjacent to other
- if ($$srclinks{$s+1}{$t-1}){next;} # links horizontally
- }
- if ($$srclinks{$s}{$t+1}){ # the same for the other
- if ($$srclinks{$s-1}{$t+1}){next;} # adjacency direction
- if ($$srclinks{$s+1}{$t+1}){next;}
- }
- $$srclinks{$s}{$t}=1; # everything ok: add the link
- $$trglinks{$t}{$s}=1;
- }
- }
- }
- }
- }
- #----------------------------------------------------------------------------
- # RunGiza: run GIZA++ using external scripts
- # (GIZA must be installed in the given directory)
- sub RunGiza{
- my $src=shift;
- my $trg=shift;
- my $viterbi=shift;
- if (not -d $GIZAHOME){
- $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++";
- if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";}
- if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";}
- if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";}
- if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";}
- if (not -d $GIZAHOME){
- warn "cannot find GIZA++ in $GIZAHOME! Set ENV{UPLUGHOME} or Uplug::Align::Word::Giza::GIZAHOME!";
- return 0;
- }
- }
- if (my $sig=system "$GIZAHOME/plain2snt.out $src $trg"){
- die "got signal $? from plain2snt!\n";
- }
- my $command="PATH=\$\{PATH\}:$GIZAHOME;";
- my $snt="$src$trg\.snt";
- if (not -e $snt){$snt="$src\_$trg\.snt";}
- if (not -e $snt){die "cannot find alignment-file: $snt!\n";}
- $command.="$GIZAHOME/trainGIZA++.sh $src\.vcb $trg\.vcb $snt";
- if (my $sig=system $command){
- die "got signal $? from trainGIZA++.sh!\n";
- }
- if ($viterbi){copy ('GIZA++.A3.final',$viterbi);}
- else{copy ('GIZA++.A3.final','viterbi');}
- }
- #----------------------------------------------------------------------------
- # Bitext2Text: convert bitexts from Uplug format (XML) to GIZA's format
- # (this is much too slow ....)
- sub Bitext2Text{
- my $bitext=shift;
- my $srcfile=shift;
- my $trgfile=shift;
- my $param=shift;
- my %SrcStream=('format'=>'text','file'=>$srcfile);
- my %TrgStream=('format'=>'text','file'=>$trgfile);
- my $input=Uplug::IO::Any->new($bitext);
- my $source=Uplug::IO::Any->new(\%SrcStream);
- my $target=Uplug::IO::Any->new(\%TrgStream);
- $input->open('read',$bitext)
- || warn "cannot open the bitext" && return 0;
- $source->open('write',\%SrcStream)
- || warn "cannot write to $srcfile" && return 0;
- $target->open('write',\%TrgStream)
- || warn "cannot write to $trgfile" && return 0;
- #-------------------------------------------------------------------------
- my $data=Uplug::Data::Align->new();
- print STDERR "convert bitext to plain text!\n";
- my $count=0;
- while ($input->read($data)){
- $count++;
- if (not ($count % 100)){
- $|=1;print STDERR '.';$|=0;
- }
- if (not ($count % 1000)){
- $|=1;print STDERR "$count\n";$|=0;
- }
- my ($srctxt,$trgtxt)=&BitextStrings($data,$param);
- if (($srctxt=~/\S/) and ($trgtxt=~/\S/)){
- $source->write($srctxt);
- $target->write($trgtxt);
- }
- }
- # $BitextHeader=$input->header;
- $input->close;
- $source->close;
- $target->close;
- return $input->header;
- }
- #----------------------------------------------------------------------------
- # get the actual strings from the bitext (using feature-parameters)
- # (feature specifications as in coocfreq.pl)
- sub BitextStrings{
- my $data=shift;
- my $param=shift;
- my ($srcnodes,$trgnodes)=@_;
- my @srctok=$data->getSrcTokenFeatures($param,$srcnodes);
- my @trgtok=$data->getTrgTokenFeatures($param,$trgnodes);
- map($_=~s/^\s+//sg,@srctok); # delete initial white-space
- map($_=~s/^\s+//sg,@trgtok);
- map($_=~s/(\S)\s+$/$1/sg,@srctok); # delete final white-space
- map($_=~s/(\S)\s+$/$1/sg,@trgtok);
- map($_=~s/\n/ /sg,@srctok); # otherwise: convert to space
- map($_=~s/\n/ /sg,@trgtok);
- map($_=~s/\s/\_/sg,@srctok); # and replace space with underline
- map($_=~s/\s/\_/sg,@trgtok); # (to avoid extra tokens)
-
- my $srctxt=join(' ',@srctok);
- my $trgtxt=join(' ',@trgtok);
- $srctxt=~tr/\n/ /;
- $trgtxt=~tr/\n/ /;
- return ($srctxt,$trgtxt);
- }
- #----------------------------------------------------------------------------
- =pod
- =head1 Synopsis
- use lib '/path/to/uplug';
- use Uplug::Align::Word::Giza;
- $ENV{UPLUGHOME}='/path/to/uplug';
- my %bitext = ('file' => 'svenprf.xces',
- 'format' => 'xces align');
- &Bitext2Text(\%bitext,'src','trg',{}); # convert to plain text
- &RunGiza('src','trg','viterbi.src-trg'); # run GIZA++ (src-->trg)
- &RunGiza('trg','src','viterbi.trg-src'); # run GIZA++ (trg-->src)
- my %dbm = (file=>'clues.dbm',
- format=>'dbm',
- key => ['source','target']);
- &Giza2Clue('.', # directory where GIZA was running (=current)
- {}, # parameter (= clue dbm header)
- 1, # =1 --> inverse (trg-src)
- \%dbm); # clue DBM (giza.dbm if not specified)
- my $combine = 'intersection' # combine heuristics (union|refined)
- my %out = ('file' => $combined, # save the result in this file
- 'format' => 'xces align');
- &Combined2Uplug('viterbi.src-trg', # name of first viterbi alignment
- 'viterbi.trg-src', # name of second viterbi alignment
- $combine, # type of combination heuristics
- \%bitext, # bitext
- {}, # token parameters
- \%out); # output stream
- =head1 GIZA++ directories
- You have to tell the program where it can find the GIZA++ executables.
- You can either set the UPLUGHOME environment variable or the GIZAHOME variable.
- $ENV{UPLUGHOME}='/path/to/uplug';
- $Uplug::Align::Word::Giza::GIZAHOME="/path/to/uplug/ext/GIZA++";
- =cut
- 1;