Giza.pm - Giza2Clue (new version): no external calls

/uplug-main/lib/Uplug/Align/Word/Giza.pm

https://bitbucket.org/tiedemann/uplug · Perl · 795 lines · 568 code · 114 blank · 113 comment · 113 complexity · 34c25d8107de0246175b6d12ef0a6418 MD5 · raw file

#-*-perl-*-

package Uplug::Align::Word::Giza;


use strict;

use Cwd;
use Exporter;
use File::Copy;

use Uplug::Data;
use Uplug::Data::Align;
use Uplug::IO::Any;
use Uplug::Config;

use vars qw(@ISA @EXPORT $DEBUG $GIZAHOME);

# @ISA = qw( Uplug::Align::Word Exporter );
@ISA = qw( Exporter);
$DEBUG = 0;

@EXPORT = qw( &Bitext2Text &RunGiza &Combined2Uplug &Giza2Clue &Giza2Uplug );

#BEGIN{
    our $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++";
    if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";}
    if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";}
    if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";}
    if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";}
#}

# if (not -d $GIZAHOME){warn "cannot find GIZA++!";exit;}


#----------------------------------------------------------------------------
# Giza2Clue (new version): no external calls
#  - looks for $dir/GIZA++.actual.ti.final (lexical prob's from GIZA)
#  - creates data/runtime/giza.dbm
#  - creates data/runtime/giza2.dbm (inverse alignments)

sub Giza2Clue{
    my $file=shift;
    my $param=shift;
    my $inverse=shift;
    my $ClueDB=shift;
    my $threshold=shift;

    if (-d $file){                             # if file is a directory
	$file="$file/GIZA++.actual.ti.final";  # look for the standard file in
    }                                          # this directory!

    my %dic;
    if (ref($ClueDB) eq 'HASH'){
	%dic=%{$ClueDB};
    }
    else{
	%dic=('format' => 'dbm',
	      'write_mode' => 'overwrite',
	      'key' => ['source','target']);
	my $cluedir='data/runtime';
	if ($inverse){$dic{file}="$cluedir/giza2.dbm";}
	else{$dic{file}="$cluedir/giza.dbm";}
    }

    my %inStream=('file' => $file,
		  'format' => 'tab',
		  'field delimiter' => ' ');
    if ($inverse){
	$inStream{'columns'}=['source','target','value',],
    }
    else{
	$inStream{'columns'}=['target','source','value',],
    }


    my %lex=();
    my $data=Uplug::Data->new;

    my $in=Uplug::IO::Any->new(\%inStream);
    $in->open('read',\%inStream);
    my $count=0;
    while ($in->read($data)){
	$count++;
	if (not ($count % 1000)){print STDERR '.';}
	if (not ($count % 10000)){print STDERR "$count\n";}
	my $src=$data->attribute('source');
	my $trg=$data->attribute('target');
	if ((not $src) or (not $trg)){next;}
	my $value=$data->attribute('value');
	if (not $value){$value=1;}
	if ((defined $threshold) and ($value<$threshold)){next;}
	$lex{$src}{$trg}=$value;
	if (($src=~s/\_/ /gs) or ($trg=~s/\_/ /gs)){ # (for giza-clue:)
	    $lex{$src}{$trg}=$value;                 #   '_' means ' '
	}
    }
    my $header=$in->header;

    my $out=Uplug::IO::Any->new(\%dic);
    $out->open('write',\%dic);
    $out->addheader($header);
    $out->addheader($param);
    $out->writeheader();

    foreach my $s (keys %lex){
	my $total;
	foreach my $t (keys %{$lex{$s}}){
	    my $score=$lex{$s}{$t};
	    my $data=Uplug::Data->new;
	    $data->setAttribute('source',$s);
	    $data->setAttribute('target',$t);
	    $data->setAttribute('score',$score);
	    $out->write($data);
	}
    }

    $out->close;
    $in->close;
}

#----------------------------------------------------------------------------
# Giza2Uplug: convert GIZA's Viterbi alignment to Uplug format (XML)
# (slow and risky: GIZA's output must be complete and use a certain format)

sub Giza2Uplug{
    my $viterbi=shift;
    my $bitext=shift;
    my $param=shift;
    my $links=shift;
    my $inverse=shift;

    if (ref($links) ne 'HASH'){return 0;}
    my $input=Uplug::IO::Any->new($bitext);
    if (not ref($input)){return 0;}
    if (not $input->open('read',$bitext)){return 0;}
    my $BitextHeader=$input->header();
    my $output=Uplug::IO::Any->new($links);
    if (not ref($output)){return 0;}
    $output->addheader($BitextHeader);
    if (not $output->open('write',$links)){return 0;}
    $output->setOption('SkipDataHeader',0); # don't skip headers and
    $output->setOption('SkipDataTail',0);   # footers (e.g. <linkGrp>)


    #------------------------------------------------------------------------
    if ($viterbi=~/\.gz$/){
	open F,"gzip -cd <$viterbi |" || 
	    die "cannot open Viterbi alignment file $viterbi!";
    }
    else{
	open F,"<$viterbi" || 
	    die "cannot open Viterbi alignment file $viterbi!";
    }
    #------------------------------------------------------------------------

    my $TokenLabel='w';
    my $data=Uplug::Data::Align->new();
    print STDERR "convert GIZA's Viterbi alignment to XML!\n";
    my $count=0;

    while ($input->read($data)){

	$count++;
	if (not ($count % 100)){
	    $|=1;print STDERR '.';$|=0;
	}
	if (not ($count % 1000)){
	    $|=1;print STDERR "$count\n";$|=0;
	}

	#----------------------------------
	# do the same as for Bitext2Text!!
	# (to check for empty strings ...)
	#
	my @SrcNodes=();
	my @TrgNodes=();
	my ($srctxt,$trgtxt)=
	    &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
	if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
	#----------------------------------

#	my $SrcData=$data->sourceData();
#	my $TrgData=$data->targetData();
#
#	my @SrcNodes=$SrcData->findNodes($TokenLabel);
	my @SrcIds=$data->attribute(\@SrcNodes,'id');
	my @SrcSpans=$data->attribute(\@SrcNodes,'span');
#	my @SrcTokens=$data->content(\@SrcNodes);
	my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes);

#	my @TrgNodes=$TrgData->findNodes($TokenLabel);
	my @TrgIds=$data->attribute(\@TrgNodes,'id');
	my @TrgSpans=$data->attribute(\@TrgNodes,'span');
#	my @TrgTokens=$data->content(\@TrgNodes);
	my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes);

	if ((not @SrcNodes) or (not @TrgNodes)){next;}

	$_=<F>;
	$_=<F>;
	chomp;
	my @src=split(/ /);
	$_=<F>;
	chomp;

	my %align=();
	my $count=1;
	while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){     # strunta i NULL!!
	    if ($2){push (@{$align{$2}},$count);}
	    $count++;
	}
	foreach (sort keys %align){
	    my @s;my @t;
	    if ($inverse){
		@t=@{$align{$_}};
		@s=split(/\s/);
	    }
	    else{
		@s=@{$align{$_}};
		@t=split(/\s/);
	    }

	    my @src=();my @trg=();
	    foreach (@s){push (@src,$SrcTokens[$_-1]);}
	    foreach (@t){push (@trg,$TrgTokens[$_-1]);}
	    my @srcId=();my @trgId=();
	    foreach (@s){push (@srcId,$SrcIds[$_-1]);}
	    foreach (@t){push (@trgId,$TrgIds[$_-1]);}
	    my @srcSpan=();my @trgSpan=();
	    foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
	    foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}

	    my %link=();
	    $link{link}=join ' ',@src;
	    $link{link}.=';';
	    $link{link}.=join ' ',@trg;
	    $link{source}=join '+',@srcId;
	    $link{target}=join '+',@trgId;
	    $link{src}=join '&',@srcSpan;
	    $link{trg}=join '&',@trgSpan;

	    $data->addWordLink(\%link);
	}

	$output->write($data);
    }
    $input->close;
    $output->close;
}


#----------------------------------------------------------------------------
# Combined2Uplug: combine GIZA's Viterbi alignment and convert them to Uplug format (XML)
# (slow and risky: GIZA's output must be complete and must use a certain format)
#
# possible combinatins: union, intersection, refined
#

sub Combined2Uplug{
    my $giza0=shift;
    my $giza1=shift;
    my $combine=shift;
    my $bitext=shift;
    my $param=shift;
    my $links=shift;

    if (ref($links) ne 'HASH'){return 0;}
    my $input=Uplug::IO::Any->new($bitext);
    if (not ref($input)){return 0;}
    if (not $input->open('read',$bitext)){return 0;}
    my $output;   # open output later (after reading the first record of
                  # the input stream --> this gives the complete input header)

    #------------------------------------------------------------------------
    if ($giza0=~/\.gz$/){open F0,"gzip -cd <$giza0 |";}
    else{open F0,"<$giza0";}
#    ($]>=5.008){binmode(F0, ":encoding(utf-8)");}

    if ($giza1=~/\.gz$/){open F1,"gzip -cd <$giza1 |";}
    else{open F1,"<$giza1";}
#    ($]>=5.008){binmode(F1, ":encoding(utf-8)");}
    #------------------------------------------------------------------------

    my $TokenLabel='w';
    my $data=Uplug::Data::Align->new();
    print STDERR "combine GIZA's Viterbi alignments and convert to XML!\n";
    my $count=0;

    while ($input->read($data)){

	if (not $output){                           # output is not opened yet:
	    my $BitextHeader=$input->header();      # - get the input header
	    $output=Uplug::IO::Any->new($links);    # - create an output stream
	    if (not ref($output)){return 0;}        #   (or die)
	    $output->addheader($BitextHeader);      # - add input header
	    if (not $output->open('write',$links)){ # - open the output stream
		return 0;                           #   (or die)
	    }
	    $output->setOption('SkipDataHeader',0); # don't skip headers and
	    $output->setOption('SkipDataTail',0);   # footers (e.g. <linkGrp>)
	}

	$count++;
	if (not ($count % 100)){
	    $|=1;print STDERR '.';$|=0;
	}
	if (not ($count % 1000)){
	    $|=1;print STDERR "$count\n";$|=0;
	}

	#----------------------------------
	# do the same as for Bitext2Text!!
	# (to check for empty strings ...)
	#
	my @SrcNodes=();
	my @TrgNodes=();
	my ($srctxt,$trgtxt)=
	    &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes);
	if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;}
	#----------------------------------

#	my @SrcNodes=$SrcData->findNodes($TokenLabel);
	my @SrcIds=$data->attribute(\@SrcNodes,'id');
	my @SrcSpans=$data->attribute(\@SrcNodes,'span');
#	my @SrcTokens=$data->content(\@SrcNodes);
	my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes);

#	my @TrgNodes=$TrgData->findNodes($TokenLabel);
	my @TrgIds=$data->attribute(\@TrgNodes,'id');
	my @TrgSpans=$data->attribute(\@TrgNodes,'span');
#	my @TrgTokens=$data->content(\@TrgNodes);
	my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes);

	if ((not @SrcNodes) or (not @TrgNodes)){next;}

	$_=<F1>;$_=<F1>;chomp;    # read source->target viterbi alignment
	my @src=split(/ /);
	$_=<F1>;chomp;
	my %srclinks=();
	my $count=1;
	while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){     # strunta i NULL!!
	    my @s=split(/\s/,$2);
	    foreach (@s){$srclinks{$_}{$count}=1;}
	    $count++;
	}


	$_=<F0>;$_=<F0>;chomp;    # read source->target viterbi alignment
	my @trg=split(/ /);
	$_=<F0>;chomp;
	my %trglinks=();
	my $count=1;
	while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){     # strunta i NULL!!
	    my @t=split(/\s/,$2);
	    foreach (@t){$trglinks{$_}{$count}=1;}
	    $count++;
	}

	my (%CombinedSrc,%CombinedTrg);
	&CombineLinks(\%srclinks,\%trglinks,$combine,\%CombinedSrc,\%CombinedTrg);
	my @cluster=&LinkClusters(\%CombinedSrc,\%CombinedTrg);

	foreach my $c (@cluster){
#	    my @s=sort {$a <=> $b} keys %{$cluster[$_]{src}};
#	    my @t=sort {$a <=> $b} keys %{$cluster[$_]{trg}};

	    my @s=@{$$c{src}};
	    my @t=@{$$c{trg}};

	    my @src=();my @trg=();
	    foreach (@s){push (@src,$SrcTokens[$_-1]);}
	    foreach (@t){push (@trg,$TrgTokens[$_-1]);}
	    my @srcId=();my @trgId=();
	    foreach (@s){push (@srcId,$SrcIds[$_-1]);}
	    foreach (@t){push (@trgId,$TrgIds[$_-1]);}
	    my @srcSpan=();my @trgSpan=();
	    foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
	    foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}

	    my %link=();
	    $link{link}=join ' ',@src;
	    $link{link}.=';';
	    $link{link}.=join ' ',@trg;
	    $link{source}=join '+',@srcId;
	    $link{target}=join '+',@trgId;
	    $link{src}=join '&',@srcSpan;
	    $link{trg}=join '&',@trgSpan;

	    $data->addWordLink(\%link);
	}

	$output->write($data);
    }
    $input->close;
    $output->close;
}


sub LinkClusters{
    my ($src,$trg)=@_;
    my @cluster=();
    while (keys %{$src}){
	my ($s,$links)=each %{$src};            # get the next source token
	if ((ref($$src{$s}) ne 'HASH') or
	    (not keys %{$$src{$s}})){           # if no links exist:
	    delete $$src{$s};                   # delete and next!
	    next;
	}
	push (@cluster,{src=>[],trg=>[]});      # create a new link cluster
	push (@{$cluster[-1]{src}},$s);         #  and save it in the cluster
	&AddLinks($cluster[-1],$src,$trg,$s,    # add all tokens aligned to the
		  'src','trg');                 #  source token to the cluster
    }                                           #  (and recursively the ones

    foreach my $c (@cluster){
	@{$$c{src}}=sort {$a <=> $b} @{$$c{src}};
	@{$$c{trg}}=sort {$a <=> $b} @{$$c{trg}};
    }
    return @cluster;
}                                               #   linked to them, see AddLinks)

sub AddLinks{
    my ($cluster,$src,$trg,$s,$key1,$key2)=@_;
    foreach my $t (keys %{$$src{$s}}){          # add all linked tokens to the
	delete $$src{$s}{$t};                   # cluster and delete the links
	delete $$trg{$t}{$s};                   # in the link-hashs
	push (@{$$cluster{$key2}},$t);
	&AddLinks($cluster,$trg,$src,$t,$key2,$key1); # add tokens aligned to the
    }                                                 # linked token to the cluster
    delete $$src{$s};                           # delete the source token link hash
}



sub CombineLinks{
    my ($src,$trg,$method,$srclinks,$trglinks)=@_;
#    my %srclinks;
#    my %trglinks;
    if ($method eq 'union'){
	foreach my $s (keys %{$src}){
	    foreach my $t (keys %{$$src{$s}}){
		$$srclinks{$s}{$t}=1;
		$$trglinks{$t}{$s}=1;
	    }
	}
	foreach my $t (keys %{$trg}){
	    foreach my $s (keys %{$$trg{$t}}){
		$$srclinks{$s}{$t}=1;
		$$trglinks{$t}{$s}=1;
	    }
	}
    }
    elsif ($method eq 'intersection'){
	foreach my $s (keys %{$src}){
	    foreach my $t (keys %{$$src{$s}}){
		if (exists $$trg{$t}{$s}){
		    $$srclinks{$s}{$t}=1;
		    $$trglinks{$t}{$s}=1;
		}
	    }
	}
    }
    if ($method eq 'refined'){                   # refined combination:
	my %links=();
	foreach my $s (keys %{$src}){               # 1) start with intersection
	    foreach my $t (keys %{$$src{$s}}){
		if (exists $$trg{$t}{$s}){
		    $$srclinks{$s}{$t}=1;
		    $$trglinks{$t}{$s}=1;
		}
		else{
		    $links{"$s:$t"}=1;              # keep union of links
		}
	    }
	}
	foreach my $t (keys %{$trg}){
	    foreach my $s (keys %{$$trg{$t}}){
		if (not exists $$src{$s}{$t}){
		    $links{"$s:$t"}=1;
		}
	    }
	}
	add_unaligned(\%links,$srclinks,$trglinks); # 2) add unaligned pairs
	add_adjacent(\%links,$srclinks,$trglinks);  # 3) add adjacent links
    }
#    $src=\%srclinks;
#    $trg=\%trglinks;
}


sub is_diagonal{
    my ($s,$t,$srclinks,$trglinks)=@_;
    if (defined $$srclinks{$s-1}){
	return 1 if (defined $$trglinks{$t-1});
	return 1 if (defined $$trglinks{$t+1});
    }
    if (defined $$srclinks{$s+1}){
	return 1 if (defined $$trglinks{$t-1});
	return 1 if (defined $$trglinks{$t+1});
    }
    return 0;
}



sub is_adjacent{
    my ($s,$t,$srclinks,$trglinks)=@_;
    if (exists $$srclinks{$s}){
	return 1 if (exists $$srclinks{$s}{$t-1});
	return 1 if (exists $$srclinks{$s}{$t+1});
    }
    elsif (exists $$trglinks{$t}){
	return 1 if (exists $$trglinks{$t}{$s-1});
	return 1 if (exists $$trglinks{$t}{$s+1});
    }
    return 0;
}


sub add_adjacent{
    my $links=shift;
    my $srclinks=shift;
    my $trglinks=shift;

    my $add=1;
    while (%{$links} && $add){
	$add=0;
	foreach my $l (keys %{$links}){
	    my ($l) = each %{$links};
	    my ($s,$t) = split(/:/,$l);
	    next if (exists $$srclinks{$s} && exists $$trglinks{$t});
	    if (is_adjacent($s,$t,$srclinks,$trglinks)){
		$$srclinks{$s}{$t}=1;
		$$trglinks{$t}{$s}=1;
		delete $$links{$l};
		$add++;
	    }
	}
    }
}



sub is_unaligned{
    my ($s,$t,$srclinks,$trglinks)=@_;
    if (not exists $$srclinks{$s}){
	return 1 if (not exists $$trglinks{$t});
    }
    return 0;
}

sub add_unaligned{
    my $links=shift;
    my $srclinks=shift;
    my $trglinks=shift;
    foreach my $l (keys %{$links}){
	my ($s,$t) = split(/:/,$l);
	if (is_unaligned($s,$t,$srclinks,$trglinks)){
	    $$srclinks{$s}{$t}=1;
	    $$trglinks{$t}{$s}=1;
	    delete $$links{$l};
	}
    }
}




sub add_adjacent_old{
    my $links=shift;
    my $srclinks=shift;
    my $trglinks=shift;
    foreach my $s (0..$#{$links}){
	foreach my $t (0..$#{$$links[$s]}){
	    next if (not $$links[$s][$t]);
	    if ((not defined $$srclinks{$s}) and
		(not defined $$trglinks{$t})){       #   - if both are not aligned yet:
		$$srclinks{$s}{$t}=1;                #     add the link
		$$trglinks{$t}{$s}=1;
	    }
	    elsif ((defined $$srclinks{$s-1}) or
		   (defined $$srclinks{$s+1})){
		if (($$srclinks{$s-1}{$t}) or         # if the link is adjacent to
		    ($$srclinks{$s+1}{$t})){       # another one horizontally:
		    if ($$srclinks{$s}{$t+1}){next;}  # do not accept if it is also
		    if ($$srclinks{$s}{$t-1}){next;}  # adjacent to other links vertically
		    if ($$srclinks{$s-1}{$t}){              # do not accept if the adjacent
			if ($$srclinks{$s-1}{$t-1}){next;}  # link is also adjacent to other
			if ($$srclinks{$s-1}{$t+1}){next;}  # links vertically
		    }
		    if ($$srclinks{$s+1}{$t}){              # the same for the other
			if ($$srclinks{$s+1}{$t-1}){next;}  # adjacency direction
			if ($$srclinks{$s+1}{$t+1}){next;}
		    }
		    $$srclinks{$s}{$t}=1;        # everything ok: add the link
		    $$trglinks{$t}{$s}=1;
		}
	    }
	    elsif ((defined $$trglinks{$t-1}) or
		   (defined $$trglinks{$t+1})){
		if (($$srclinks{$s}{$t-1}) or         # if the link is adjacent to
		    ($$srclinks{$s}{$t+1})){          # another one vertically:
		    if ($$srclinks{$s+1}{$t}){next;}  # do not accept if it is also
		    if ($$srclinks{$s-1}{$t}){next;}  # adjacent to other links horizontally
		    if ($$srclinks{$s}{$t-1}){              # do not accept if the adjacent
			if ($$srclinks{$s-1}{$t-1}){next;}  # link is also adjacent to other
			if ($$srclinks{$s+1}{$t-1}){next;}  # links horizontally
		    }
		    if ($$srclinks{$s}{$t+1}){              # the same for the other
			if ($$srclinks{$s-1}{$t+1}){next;}  # adjacency direction
			if ($$srclinks{$s+1}{$t+1}){next;}
		    }
		    $$srclinks{$s}{$t}=1;        # everything ok: add the link
		    $$trglinks{$t}{$s}=1;
		}
	    }
	}
    }
}


#----------------------------------------------------------------------------
# RunGiza: run GIZA++ using external scripts
# (GIZA must be installed in the given directory)

sub RunGiza{
    my $src=shift;
    my $trg=shift;
    my $viterbi=shift;

    if (not -d $GIZAHOME){
        $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++";
    	if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";}
    	if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";}
    	if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";}
    	if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";}
        if (not -d $GIZAHOME){
	   warn "cannot find GIZA++ in $GIZAHOME! Set ENV{UPLUGHOME} or Uplug::Align::Word::Giza::GIZAHOME!";
	   return 0;
        }
    }
    if (my $sig=system "$GIZAHOME/plain2snt.out $src $trg"){
	die "got signal $? from plain2snt!\n";
    }
    my $command="PATH=\$\{PATH\}:$GIZAHOME;";
    my $snt="$src$trg\.snt";
    if (not -e $snt){$snt="$src\_$trg\.snt";}
    if (not -e $snt){die "cannot find alignment-file: $snt!\n";}
    $command.="$GIZAHOME/trainGIZA++.sh $src\.vcb $trg\.vcb $snt";
    if (my $sig=system $command){
	die "got signal $? from trainGIZA++.sh!\n";
    }
    if ($viterbi){copy ('GIZA++.A3.final',$viterbi);}
    else{copy ('GIZA++.A3.final','viterbi');}
}

#----------------------------------------------------------------------------
# Bitext2Text: convert bitexts from Uplug format (XML) to GIZA's format
# (this is much too slow ....)

sub Bitext2Text{
    my $bitext=shift;
    my $srcfile=shift;
    my $trgfile=shift;
    my $param=shift;

    my %SrcStream=('format'=>'text','file'=>$srcfile);
    my %TrgStream=('format'=>'text','file'=>$trgfile);

    my $input=Uplug::IO::Any->new($bitext);
    my $source=Uplug::IO::Any->new(\%SrcStream);
    my $target=Uplug::IO::Any->new(\%TrgStream);
    $input->open('read',$bitext)
	|| warn "cannot open the bitext" && return 0;
    $source->open('write',\%SrcStream) 
	|| warn "cannot write to $srcfile" && return 0;
    $target->open('write',\%TrgStream) 
	|| warn "cannot write to $trgfile" && return 0;

    #-------------------------------------------------------------------------

    my $data=Uplug::Data::Align->new();

    print STDERR "convert bitext to plain text!\n";
    my $count=0;
    while ($input->read($data)){
	$count++;
	if (not ($count % 100)){
	    $|=1;print STDERR '.';$|=0;
	}
	if (not ($count % 1000)){
	    $|=1;print STDERR "$count\n";$|=0;
	}

	my ($srctxt,$trgtxt)=&BitextStrings($data,$param);

	if (($srctxt=~/\S/) and ($trgtxt=~/\S/)){
	    $source->write($srctxt);
	    $target->write($trgtxt);
	}
    }
#     $BitextHeader=$input->header;
    $input->close;
    $source->close;
    $target->close;
    return $input->header;
}

#----------------------------------------------------------------------------
# get the actual strings from the bitext (using feature-parameters)
# (feature specifications as in coocfreq.pl)

sub BitextStrings{
    my $data=shift;
    my $param=shift;
    my ($srcnodes,$trgnodes)=@_;

    my @srctok=$data->getSrcTokenFeatures($param,$srcnodes);
    my @trgtok=$data->getTrgTokenFeatures($param,$trgnodes);

    map($_=~s/^\s+//sg,@srctok);         # delete initial white-space
    map($_=~s/^\s+//sg,@trgtok);
    map($_=~s/(\S)\s+$/$1/sg,@srctok);   # delete final white-space
    map($_=~s/(\S)\s+$/$1/sg,@trgtok);

    map($_=~s/\n/ /sg,@srctok);          # otherwise: convert to space
    map($_=~s/\n/ /sg,@trgtok);
    map($_=~s/\s/\_/sg,@srctok);         # and replace space with underline
    map($_=~s/\s/\_/sg,@trgtok);         # (to avoid extra tokens)
	
    my $srctxt=join(' ',@srctok);
    my $trgtxt=join(' ',@trgtok);

    $srctxt=~tr/\n/ /;
    $trgtxt=~tr/\n/ /;
    return ($srctxt,$trgtxt);
}

#----------------------------------------------------------------------------



=pod

=head1 Synopsis

 use lib '/path/to/uplug';
 use Uplug::Align::Word::Giza;

 $ENV{UPLUGHOME}='/path/to/uplug';


 my %bitext = ('file' => 'svenprf.xces',
               'format' => 'xces align');


 &Bitext2Text(\%bitext,'src','trg',{});     # convert to plain text
 &RunGiza('src','trg','viterbi.src-trg');   # run GIZA++ (src-->trg)
 &RunGiza('trg','src','viterbi.trg-src');   # run GIZA++ (trg-->src)

 my %dbm = (file=>'clues.dbm',
	    format=>'dbm',
	    key => ['source','target']);

 &Giza2Clue('.',         # directory where GIZA was running (=current)
	    {},          # parameter (= clue dbm header)
	    1,           # =1 --> inverse (trg-src)
	    \%dbm);      # clue DBM (giza.dbm if not specified)

 my $combine = 'intersection'       # combine heuristics (union|refined)
 my %out = ('file' => $combined,    # save the result in this file
            'format' => 'xces align');

  &Combined2Uplug('viterbi.src-trg',  # name of first viterbi alignment
                  'viterbi.trg-src',  # name of second viterbi alignment
                  $combine,           # type of combination heuristics
		  \%bitext,           # bitext
		  {},                 # token parameters
                  \%out);             # output stream


=head1 GIZA++ directories

You have to tell the program where it can find the GIZA++ executables.
You can either set the UPLUGHOME environment variable or the GIZAHOME variable.

 $ENV{UPLUGHOME}='/path/to/uplug';
 $Uplug::Align::Word::Giza::GIZAHOME="/path/to/uplug/ext/GIZA++";

=cut



1;
Alerts (15)

Complexity hotspot; lines 586 to 590 (total complexity: 5)
586 587 588 589 590
Complexity hotspot; lines 604 to 608 (total complexity: 5)
604 605 606 607 608
Complexity hotspot; lines 634 to 638 (total complexity: 5)
634 635 636 637 638