PageRenderTime 11ms CodeModel.GetById 2ms app.highlight 156ms RepoModel.GetById 2ms app.codeStats 0ms

/uplug-main/lib/Uplug/Align/Word/Clue.pm

https://bitbucket.org/tiedemann/uplug
Perl | 1757 lines | 1127 code | 335 blank | 295 comment | 143 complexity | cce623150dbedaaae122e96aa1b531ca MD5 | raw file
   1#-*-perl-*-
   2####################################################################
   3# Copyright (C) 2004 J�rg Tiedemann  <joerg@stp.ling.uu.se>
   4#
   5# This program is free software; you can redistribute it and/or modify
   6# it under the terms of the GNU General Public License as published by
   7# the Free Software Foundation; either version 2 of the License, or
   8# (at your option) any later version.
   9#
  10# This program is distributed in the hope that it will be useful,
  11# but WITHOUT ANY WARRANTY; without even the implied warranty of
  12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13# GNU General Public License for more details.
  14#
  15# You should have received a copy of the GNU General Public License
  16# along with this program; if not, write to the Free Software
  17# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18#
  19###########################################################################
  20# Uplug::Align::Word::Clue
  21#
  22#
  23#
  24###########################################################################
  25
  26
  27package Uplug::Align::Word::Clue;
  28
  29use strict;
  30# use Time::HiRes qw(time);
  31
  32
  33use vars qw(@ISA $DEBUG);
  34use vars qw($INPHRASESONLY $ADJACENTONLY $ADJACENTSCORE $FILLPHRASES);
  35use vars qw($ALLOWMULTIOVERLAP $PRINTHTML);
  36# use utf8;
  37use Uplug::Data;
  38use Uplug::Align::Word;
  39use Data::Dumper;
  40$Data::Dumper::Indent=1;
  41$Data::Dumper::Terse=1;
  42$Data::Dumper::Purity=1;
  43
  44@ISA = qw( Uplug::Align::Word );
  45
  46$DEBUG = 0;
  47
  48#---------------------------------
  49# parameters for add2LinkCluster
  50
  51$INPHRASESONLY = 0;          # if = 1 --> no links outside of chunks
  52$ADJACENTONLY = 0;           # if = 1 --> allow only adjacent links
  53$ADJACENTSCORE = 0;          # if > 0 --> $score >= $neighbor * $ADJACENTSCORE
  54# $ALLOWMULTIOVERLAP = 0;      # allow overlap with more than 1 link cluster!
  55# $ADJACENTSCORE = 0.4;
  56$ADJACENTSCORE = 0;
  57
  58$FILLPHRASES = 0;            #  ... doesn't work ....
  59
  60
  61
  62sub new{
  63    my $class=shift;
  64    my $self=$class->SUPER::new(@_);
  65    if (not defined $self->parameter('adjacent_only')){
  66	$self->setParameter('adjacent_only',$ADJACENTONLY);
  67    }
  68    if (not defined $self->parameter('adjacent_score')){
  69	$self->setParameter('adjacent_score',$ADJACENTSCORE);
  70    }
  71    if (not defined $self->parameter('in_phrases_only')){
  72	$self->setParameter('in_phrases_only',$INPHRASESONLY);
  73    }
  74    if (not defined $self->parameter('fill_phrase')){
  75	$self->setParameter('fill_phrases',$FILLPHRASES);
  76    }
  77    if (not defined $self->parameter('allow_multiple_overalps')){
  78	$self->setParameter('allow_multiple_overlaps',$ALLOWMULTIOVERLAP);
  79    }
  80    if (not defined $self->parameter('verbose')){
  81	$self->setParameter('verbose',$DEBUG);
  82    }
  83    else{$DEBUG=$self->parameter('verbose');}
  84
  85    return $self;
  86}
  87
  88sub DESTROY {
  89    my $self = shift;
  90}
  91
  92
  93
  94#===========================================================================
  95#
  96# get all link scores and fill the clue matrix
  97#
  98#===========================================================================
  99
 100sub getLinkScores{
 101    my $self=shift;
 102
 103    $self->{matrix}=[];
 104    $self->{links}={};
 105    my $LinkProb=$self->{matrix};
 106    my $links=$self->{linkStreams};
 107    my $SrcTok=$self->{srcToken};
 108    my $TrgTok=$self->{trgToken};
 109    my $Param=$self->{param};
 110    my $data=$self->{data};
 111
 112    ## prepare clue param hash (reduce checks in the loop below)
 113    my %ClueParam=%{$Param};
 114    if (exists $ClueParam{general}){delete $ClueParam{general};}
 115    if (exists $ClueParam{original}){delete $ClueParam{original};}
 116    foreach (keys %ClueParam){
 117	if (ref($ClueParam{$_}) ne 'HASH'){$ClueParam{$_}={};}
 118	if (not defined $ClueParam{$_}{'score weight'}){
 119	    $ClueParam{$_}{'score weight'}=$self->defaultClueWeight();
 120	}
 121    }
 122
 123    ## define some variables used in the loop
 124    my $weight;           # clue combination weight
 125    my ($src,$trg);       # source and target language tokens
 126    my $score;            # clue score found for the current pair
 127    my $time;             # time (for debugging)
 128    my %search;           # hash of patterns for searching clues
 129    my $found=Uplug::Data->new;  # clues found
 130    my @SrcTok;           # positions of the current source
 131    my @TrgTok;           # and target tokens in the sentence
 132    my ($s,$t,$x,$y,$p);  # variables for iteration
 133    my $ScoreComb=$self->parameter('score combination');
 134    if (not $ScoreComb){$ScoreComb='probabilistic';}
 135
 136    if ($self->parameter('verbose')){
 137      print STDERR "\n=====================================================\n";
 138      print STDERR "matching clue scores";
 139      print STDERR "\n=====================================================\n";
 140    }
 141
 142
 143    ## the following loop takes most of the time!
 144
 145    foreach $s (sort {$a <=> $b} keys %{$SrcTok}){
 146	foreach $t (keys %{$TrgTok}){
 147
 148	    $time=time();
 149
 150	    ($src,$trg)=($$SrcTok{$s}{general},$$TrgTok{$t}{general});
 151	    $self->alignIdentical($src,$trg,$s,$t,$LinkProb);
 152
 153	    ### DEBUG: store search time
 154	    $self->{identical_score_time}+=time()-$time if ($DEBUG);
 155
 156	    foreach (keys %ClueParam){
 157
 158		$time=time();
 159
 160		$weight=$ClueParam{$_}{'score weight'};
 161		if ($ClueParam{$_}{'relative position'}){
 162		    ($src,$trg)=$self->makeRelPosFeature($$SrcTok{$s}{$_},
 163							 $$TrgTok{$t}{$_});
 164		}
 165		else{($src,$trg)=($$SrcTok{$s}{$_},$$TrgTok{$t}{$_});}
 166
 167		### DEBUG: store search time
 168		$self->{before_score_time}+=time()-$time if ($DEBUG);
 169
 170		$score=0;
 171
 172		#---------------------------------------
 173		# length difference as  scores ...
 174		#---------------------------------------
 175
 176		if ($ClueParam{$_}{'string length difference'}){
 177		    $score=$data->lengthQuotient($src,$trg);
 178		}
 179
 180		#---------------------------------------
 181		# otherwise: search scores in link-DB
 182		#---------------------------------------
 183
 184		else{
 185		    if (not defined $links->{$_}){next;}
 186		    if (defined($src) and defined($trg)){
 187			%search=('source' => $src,
 188				 'target' => $trg);
 189			$time=time();
 190			if ($links->{$_}->select($found,\%search)){
 191			    $score=$found->attribute('score');
 192			}
 193			### DEBUG: store search time
 194			$self->{search_score_time}+=time()-$time if ($DEBUG);
 195		    }
 196		}
 197
 198		$time=time();
 199
 200		#---------------------------------------
 201		# set weighted score in score matrix
 202		#---------------------------------------
 203
 204		if (not $score){next;}
 205		if (not $data->checkPairParameter($src,$trg,$ClueParam{$_})){
 206		    ### DEBUG: store search time
 207		    $self->{after_score_time}+=time()-$time if ($DEBUG);
 208		    next;
 209		}
 210
 211		if (exists $ClueParam{$_}{'minimal score'}){
 212		    if ($score<$ClueParam{$_}{'minimal score'}){
 213			### DEBUG: store search time
 214			$self->{after_score_time}+=time()-$time if ($DEBUG);
 215			next;
 216		    }
 217		}
 218
 219		$score*=$weight;
 220
 221		# shouldn't be >1, but in case ...
 222		#--------------------------------
 223		if ($score>=1){$score=0.999999999999;}
 224		#--------------------------------
 225
 226		if ($self->parameter('verbose')){
 227#		    printf STDERR "%20s [ %s %s ] %15s - %-15s %s\n",
 228		    printf STDERR "$_\t$s\t$t\t$src\t$trg\t$score\n";
 229		}
 230
 231		@SrcTok=split(/:/,$s);
 232		@TrgTok=split(/:/,$t);
 233
 234		foreach $x (@SrcTok){
 235		    foreach $y (@TrgTok){
 236
 237#			if ($self->parameter('verbose')){
 238#			    printf STDERR "%20s [%d %d] %15s - %-15s %s\n",
 239#			    $_,$x,$y,$src,$trg,$score;
 240#			}
 241
 242			if ($ScoreComb eq 'addition'){
 243			    $$LinkProb[$x][$y]+=$score;
 244			}
 245#
 246# log-linear and multiplication are useless!
 247# * there's not always a positive score for each possible pair! 
 248#   --> multiplications with one factor = 0 --> score = 0
 249#   --> leaving out zero-factors -> implicit penalty for pairs with multiple
 250#                                   clue scores
 251#
 252#			elsif ($ScoreComb eq 'log-linear'){
 253#			    $$LinkProb[$x][$y]+=log($score);
 254#			}
 255#			elsif ($ScoreComb eq 'multiplication'){
 256#			    $$LinkProb[$x][$y]+=log($score);
 257#			}
 258			else{
 259			    $p=$$LinkProb[$x][$y];
 260			    $$LinkProb[$x][$y]=$p+$score-$p*$score;
 261			}
 262		    }
 263		}
 264
 265		### DEBUG: store search time
 266		$self->{after_score_time}+=time()-$time if ($DEBUG);
 267
 268	    }
 269	}
 270    }
 271
 272    $time=time();
 273    $self->align1x($LinkProb);
 274
 275#    if ($ScoreComb eq 'log-linear'){              # special for log-linear:
 276#	foreach $x (0..$#{$LinkProb}){            # reverse log (make positiv
 277#	    foreach $y (0..$#{$$LinkProb[$x]}){   # score values)
 278#		$$LinkProb[$x][$y]=exp($$LinkProb[$x][$y]);
 279#	    }
 280#	}
 281#    }
 282
 283    if ($self->parameter('verbose')){
 284	$self->printClueMatrix($self->{token}->{source},
 285			       $self->{token}->{target},
 286			       $self->{matrix});
 287
 288	$self->printBitextTokensWithID();
 289
 290#	$self->printBitextToken($self->{token}->{source},
 291#				$self->{token}->{target});
 292
 293    }
 294    ### DEBUG: store search time
 295    $self->{'1x_score_time'}+=time()-$time if ($DEBUG);
 296}
 297
 298
 299#===========================================================================
 300#
 301# search for the best alignment using the clue matrix scores
 302#
 303#   topLinkSearch ........ iteratively add top links to link clusters
 304#   nextBestSearch ....... score = distance to next best link (+topLinkSearch)
 305#   oneOneFirstSearch .... non-overlapping first, overlapping then
 306#   competitiveSearch .... competitive linking (1:1 links only!)
 307#   bidirectionalRefineSearch  intersection of directional links + overlapping
 308#   directionalSrcSearch ..... best alignment source --> target
 309#   directionalTrgSearch ..... best alignment target --> source
 310#   bidirectionalUnion ....... union of directionalSrc & directionalTrg
 311#   bidirectionalIntersection  intersection of directionalSrc & directionalTrg
 312#
 313# parameter search: nextbest ........ nextBestSearch
 314#                   oneone....... ... oneOneFirstSearch
 315#                   competitive ..... competitiveSearch
 316#                   myrefined ....... bidirectionalRefinedSearch
 317#                   och ............. bidirectionalRefinedSearchOch
 318#                   src ............. directionalSrcSearch
 319#                   trg ............. directionalTrgSearch
 320#                   union ........... bidirectionalUnion
 321#                   intersection .... bidirectionalIntersection
 322#                   <default> ....... topLinkSearch
 323#
 324#===========================================================================
 325
 326
 327sub findAlignment{
 328    my $self=shift;
 329    $self->{links}={};
 330    my $minScore=$self->scoreThreshold();
 331    my $method=$self->parameter('search');
 332
 333    if ($method=~/nextbest/){
 334	return $self->nextBestSearch($self->{links},$minScore);}
 335    elsif ($method=~/competitive/){
 336	return $self->competitiveSearch($self->{links},$minScore);}
 337    elsif ($method=~/oneone/){
 338	return $self->oneOneFirstSearch($self->{links},$minScore);}
 339    elsif ($method=~/myrefined/){
 340	return $self->bidirectionalRefinedSearch($self->{links},$minScore);}
 341    elsif ($method=~/(och|refined)/){
 342	return $self->bidirectionalRefinedSearchOch($self->{links},$minScore);}
 343    elsif ($method=~/src/){
 344	return $self->directionalSrcSearch($self->{links},$minScore);}
 345    elsif ($method=~/trg/){
 346	return $self->directionalTrgSearch($self->{links},$minScore);}
 347    elsif ($method=~/union/){
 348	return $self->bidirectionalUnion($self->{links},$minScore);}
 349    elsif ($method=~/intersection/){
 350	return $self->bidirectionalIntersection($self->{links},$minScore);}
 351    else{
 352	return $self->topLinkSearch($self->{links},$minScore);}
 353}
 354
 355
 356
 357
 358
 359#===========================================================================
 360# add scores to the clue matrix for 
 361# sentence alignments with only 1 word in either source or target
 362#===========================================================================
 363
 364
 365sub align1x{
 366    my $self=shift;
 367    my ($LinkProb)=@_;
 368    my $Align11=$self->parameter('align 1:1');
 369    my $Align1x=$self->parameter('align 1:x');
 370    if ($Align11 and ($#{$LinkProb}==0)){
 371	if ($#{$$LinkProb[0]}==0){
 372	    my $p=$$LinkProb[0][0];
 373	    $$LinkProb[0][0]=$p+$Align11-$p*$Align11;
 374	    return;
 375	}
 376    }
 377    if ($Align1x and ($#{$LinkProb}==0)){
 378	foreach (0..$#{$$LinkProb[0]}){
 379	    my $p=$$LinkProb[0][$_];
 380	    $$LinkProb[0][$_]=$p+$Align1x-$p*$Align1x;
 381	}
 382	return;
 383    }
 384    if ($Align1x){
 385	my $ok=1;
 386	foreach (0..$#{$LinkProb}){
 387	    if ($#{$$LinkProb[$_]}!=0){$ok=0;}
 388	}
 389	if ($ok){
 390	    foreach (0..$#{$LinkProb}){
 391		my $p=$$LinkProb[$_][0];
 392		$$LinkProb[$_][0]=$p+$Align1x-$p*$Align1x;
 393	    }
 394	}
 395    }
 396}
 397
 398#===========================================================================
 399# add scores to the clue matrix for
 400# pairs of identical tokens with at least one non-alphabetical character
 401# (hard-coded as /[^A-Za-z]/ !!!!!!)
 402#===========================================================================
 403
 404sub alignIdentical{
 405    my $self=shift;
 406    my $AlignIdentical=$self->parameter('align identical');
 407    if (not $AlignIdentical){return;}
 408    my ($src,$trg,$s,$t,$LinkProb)=@_;
 409    if ($src=~/[^A-Za-z]/){
 410	if ($src eq $trg){
 411	    my @SrcTok=split(/:/,$s);
 412	    my @TrgTok=split(/:/,$t);
 413	    foreach my $x (@SrcTok){
 414		foreach my $y (@TrgTok){
 415		    my $p=$$LinkProb[$x][$y];
 416		    $$LinkProb[$x][$y]=$p+$AlignIdentical-$p*$AlignIdentical;
 417		}
 418	    }
 419	}
 420    }
 421}
 422
 423
 424
 425
 426#===========================================================================
 427#
 428# topLinkSearch:
 429#   1) search best link in the matrix
 430#   2) add link to link clusters
 431#   3) continue with 1) until finished
 432#
 433#===========================================================================
 434
 435sub topLinkSearch{
 436    my $self=shift;
 437    my $Links=shift;
 438    my $MinScore=shift;
 439
 440    my $LinkProb=$self->{matrix};
 441    my $Token=$self->{token};
 442    my $TokenAttr=$self->{tokenAttr};
 443
 444    my @SrcLinks;
 445    my @TrgLinks;
 446    my $NrSrc=$#{$$Token{source}};
 447    my $NrTrg=$#{$$Token{target}};
 448
 449    my @LinkMatrix;
 450    my @LinkCluster;
 451    my ($x,$y);
 452
 453
 454    # ----------------------------
 455
 456#    print STDERR "---------new sentence-------$MinScore-------\n";
 457    undef $self->{SORTEDLINKS};
 458
 459    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);   # clone the matrix
 460
 461    while (($x,$y)=$self->getTopLink(\@LinkMatrix,$MinScore)){
 462#	print STDERR "$x:$y:$LinkMatrix[$x][$y]\n";
 463	if ($MinScore=~/\%/){
 464	    $MinScore=$LinkMatrix[$x][$y]*$MinScore/100;
 465#	    print STDERR "## minscore == $MinScore\n";
 466	}
 467	if (not defined($LinkMatrix[$x][$y])){last;}
 468	if ($LinkMatrix[$x][$y]<$MinScore){last;}
 469
 470	if ($self->add2LinkCluster($x,$y,\@LinkCluster)){
 471	    $LinkMatrix[$x][$y]=0;
 472	}
 473    }
 474
 475
 476    # ----------------------------
 477    # get the links from the set of link clusters
 478
 479    $self->getClusterLinks(\@LinkCluster,$Links);     # get links
 480}
 481
 482
 483
 484
 485#===========================================================================
 486#
 487# nextBestSearch:
 488#    1) find score distance to "next best link" for each word pair
 489#    2) call topLinkSearch
 490#
 491#===========================================================================
 492
 493
 494
 495sub nextBestSearch{
 496    my $self=shift;
 497    my $LinkProb=$self->{matrix};
 498    $self->nextBestMatrix($LinkProb);
 499    return $self->topLinkSearch(@_);
 500}
 501
 502
 503sub nextBestMatrix{
 504    my $self=shift;
 505    my ($LinkProb)=@_;
 506
 507    my @SortedColumns=();
 508    my @SortedRows=();
 509
 510    my $sizeX=$#{$LinkProb};
 511    my $sizeY=$#{$$LinkProb[0]};
 512
 513    foreach my $x (0..$sizeX){
 514	@{$SortedColumns[$x]}=
 515	    sort {$$LinkProb[$x][$b] <=> $$LinkProb[$x][$a]} (0..$sizeY);
 516    }
 517    foreach my $y (0..$sizeY){
 518	@{$SortedRows[$y]}=
 519	    sort {$$LinkProb[$b][$y] <=> $$LinkProb[$a][$y]} (0..$sizeX);
 520    }
 521
 522    my @LinkMatrix=();
 523    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
 524
 525    my $lowest=0;
 526
 527    foreach my $x (0..$sizeX){
 528	foreach my $y (0..$sizeY){
 529	    my $NextBestY=$SortedColumns[$x][0];
 530	    if ($NextBestY==$y){$NextBestY=$SortedColumns[$x][1];}
 531	    my $NextBestX=$SortedRows[$y][0];
 532	    if ($NextBestX==$x){$NextBestX=$SortedRows[$y][1];}
 533	    my $NextBest=$LinkMatrix[$NextBestX][$y];
 534	    if ($LinkMatrix[$x][$NextBestY]>$NextBest){
 535		$NextBest=$LinkMatrix[$x][$NextBestY];
 536	    }
 537	    $$LinkProb[$x][$y]-=$NextBest;
 538	    if ($$LinkProb[$x][$y]<$lowest){
 539		$lowest=$$LinkProb[$x][$y];
 540	    }
 541	}
 542    }
 543    foreach my $x (0..$sizeX){               # normalize!
 544	foreach my $y (0..$sizeY){           # no negative values
 545	    $$LinkProb[$x][$y]-=$lowest;     # in the matrix!
 546	}
 547    }
 548    if ($self->parameter('verbose')){
 549	$self->printClueMatrix($self->{token}->{source},
 550			       $self->{token}->{target},
 551			       $LinkProb);
 552    }
 553}
 554
 555
 556
 557#===========================================================================
 558#
 559# oneOneFirstSearch:
 560#    1) find all one-to-one word links first (non-overlapping links)
 561#    2) add iteratively overlapping links
 562#
 563#===========================================================================
 564
 565sub oneOneFirstSearch{
 566    my $self=shift;
 567    my $Links=shift;
 568    my $MinScore=shift;
 569
 570    my $LinkProb=$self->{matrix};
 571    my $Token=$self->{token};
 572    my $TokenAttr=$self->{tokenAttr};
 573
 574    my @SrcLinks;
 575    my @TrgLinks;
 576    my $NrSrc=$#{$$Token{source}};
 577    my $NrTrg=$#{$$Token{target}};
 578
 579    my @LinkMatrix;
 580    my @LinkCluster;
 581    my ($x,$y);
 582
 583
 584    # ----------------------------
 585    # 1) get all word-to-word links without any overlaps
 586
 587    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);   # clone the matrix
 588
 589    while (($x,$y)=$self->getTopLink(\@LinkMatrix,$MinScore)){
 590	if ($MinScore=~/\%/){
 591	    $MinScore=$LinkMatrix[$x][$y]*$MinScore/100;
 592	    print STDERR "## minscore == $MinScore\n";
 593	}
 594	if ($LinkMatrix[$x][$y]<$MinScore){last;}
 595	my @overlap=$self->findClusterOverlap($x,$y,\@LinkCluster);
 596
 597	if (not @overlap){
 598	    $LinkCluster[$#LinkCluster+1]={};
 599	    $LinkCluster[-1]{src}{$x}=1;
 600	    $LinkCluster[-1]{trg}{$y}=1;
 601	}
 602	$LinkMatrix[$x][$y]=0;
 603    }
 604
 605    # ----------------------------
 606    # 2) do it again --> find overlapping links!
 607
 608    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);   # clone the matrix
 609
 610    while (($x,$y)=$self->getTopLink(\@LinkMatrix,$MinScore)){
 611	if ($LinkMatrix[$x][$y]<$MinScore){last;}
 612	$self->add2LinkCluster($x,$y,\@LinkCluster);
 613	$LinkMatrix[$x][$y]=0;
 614    }
 615
 616
 617    # ----------------------------
 618    # get the links from the set of link clusters
 619
 620    $self->getClusterLinks(\@LinkCluster,$Links);     # get links
 621}
 622
 623
 624
 625
 626#===========================================================================
 627# ------------------  directional alignment (source to target) ----------------
 628#===========================================================================
 629
 630sub directionalSrcSearch{
 631    my $self=shift;
 632    my $Links=shift;
 633    my $MinScore=shift;
 634    my $competitive=shift;
 635
 636    my @LinkCluster;
 637    my ($x,$y);
 638
 639    my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
 640
 641    foreach (0..$#SrcLinks){
 642	if ((defined $SrcLinks[$_]) and 
 643	    ($SrcLinks[$_] > 0)){
 644	    $self->add2LinkCluster($_,$SrcLinks[$_],\@LinkCluster);
 645	}
 646    }
 647
 648    $self->getClusterLinks(\@LinkCluster,$Links);
 649}
 650
 651#===========================================================================
 652# ------------------  directional alignment (target to source ) ---------------
 653#===========================================================================
 654
 655sub directionalTrgSearch{
 656    my $self=shift;
 657    my $Links=shift;
 658    my $MinScore=shift;
 659    my $competitive=shift;
 660
 661    my @LinkCluster;
 662    my ($x,$y);
 663
 664    my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
 665    foreach (0..$#TrgLinks){
 666	if ((defined $TrgLinks[$_]) and 
 667	    ($TrgLinks[$_] > 0)){
 668	    $self->add2LinkCluster($TrgLinks[$_],$_,\@LinkCluster);
 669	}
 670    }
 671
 672    $self->getClusterLinks(\@LinkCluster,$Links);
 673}
 674
 675
 676#===========================================================================
 677# competitive linking
 678#   1) get best word-to-word link (s,t)
 679#   2) remove alternative links for (s) and for (t)
 680#   3) go to 1) until finished
 681#===========================================================================
 682
 683sub competitiveSearch{
 684    my $self=shift;
 685    my $Links=shift;
 686    my $MinScore=shift;
 687    if (not defined $MinScore){
 688	$MinScore=0.00000000000001;
 689    }
 690
 691    my $Token=$self->{token};
 692    my $NrSrc=$#{$$Token{source}};
 693    my $NrTrg=$#{$$Token{target}};
 694
 695    my @WordLinks=();
 696    if ($NrTrg>$NrSrc){
 697	return $self->directionalTrgSearch($Links,$MinScore,1);
 698    }
 699    return $self->directionalSrcSearch($Links,$MinScore,1);
 700}
 701
 702#===========================================================================
 703# refined symmetric link search a la Och&Ney
 704#
 705#===========================================================================
 706
 707sub bidirectionalRefinedSearchOch{
 708    my $self=shift;
 709    my $Links=shift;
 710    my $MinScore=shift;
 711    my $competitive=shift;
 712    if (not defined $MinScore){
 713	$MinScore=0.00000000000001;
 714    }
 715
 716    my $LinkProb=$self->{matrix};
 717    my @LinkCluster;
 718    my %WordLinks=();
 719    my %InvWordLinks=();
 720    my ($x,$y);
 721
 722    #-----------------------------------
 723    # 1) get directional links
 724
 725    my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
 726    my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
 727
 728    #-----------------------------------
 729    # 2) intersection of directional links
 730
 731    foreach (0..$#SrcLinks){
 732	if ((defined $SrcLinks[$_]) and
 733	    ($TrgLinks[$SrcLinks[$_]] eq $_)){
 734	    $WordLinks{$_}{$SrcLinks[$_]}=1;
 735	    $InvWordLinks{$SrcLinks[$_]}{$_}=1;
 736#	    print STDERR "$_ --> $SrcLinks[$_]\n";
 737	}
 738    }
 739
 740
 741    #-----------------------------------
 742    # 3) add overlapping links
 743    #    * sort all scores in the matrix
 744    #    * run through possible links starting with the highest score
 745    #    * repeat until no more links can be added
 746    #
 747    # links (s,t) are added if
 748    #    * there is no other link for both, s AND t
 749    #    * or ..the new link is adjacent to another link in source OR target
 750    #           and thew new link does not create links which have neighbors
 751    #           in both directions
 752
 753    my %scores=();
 754    foreach my $s (0..$#{$LinkProb}){
 755	foreach my $t (0..$#{$$LinkProb[$s]}){   # put all scores
 756	    $scores{"$s:$t"}=$$LinkProb[$s][$t]; # in a long list
 757	}
 758    }
 759
 760    my $added=0;
 761    do{
 762	$added=0;
 763	foreach my $pair (sort {$scores{$b} <=> $scores{$a} } keys %scores){
 764	    if ($scores{$pair}<$MinScore){last;}
 765	    my ($s,$t)=split(/\:/,$pair);
 766
 767	    if (((not defined $WordLinks{$s}) or      # if no other links
 768		 (not keys %{$WordLinks{$s}})) and    # are defined for both,
 769		((not defined $InvWordLinks{$t}) or   # source AND target
 770		 (not keys %{$InvWordLinks{$t}}))){   # word:
 771		$added++;
 772		$scores{$pair}=0;                     # add the link
 773		$WordLinks{$s}{$t}=1;
 774		$InvWordLinks{$t}{$s}=1;
 775#		print STDERR "add $s --> $t (new)\n";
 776	    }
 777	    elsif ((($s>0) and 
 778		    (defined $WordLinks{$s-1}{$t})) or    # the link has a
 779		   (defined $WordLinks{$s+1}{$t}) or      # vertical neighbor
 780		   (($t>0) and 
 781		    (defined $WordLinks{$s}{$t-1})) or    # or a
 782		   (defined $WordLinks{$s}{$t+1})){       # horizontal neighbor
 783
 784		$InvWordLinks{$t}{$s}=1;
 785		$WordLinks{$s}{$t}=1;                     # if there are
 786		if (&CheckWordLinks(\%WordLinks,          # no links with
 787				    \%InvWordLinks)){     # neighbors in both
 788		    $added++;                             # dimensions! -->
 789		    $scores{$pair}=0;                     # add the new link
 790#		    print STDERR "add $s --> $t (adj)\n";
 791		}
 792		else{                                     # else:
 793		    delete $WordLinks{$s}{$t};            # delete the link
 794		    delete $InvWordLinks{$t}{$s};
 795#		    print STDERR "reject $s --> $t\n";
 796		}
 797	    }
 798	}
 799    }
 800    until (not $added);      # repeat as long as links are added!
 801
 802    $self->setParameter('adjacent_only',0);
 803    $self->setParameter('adjacent_score',0);
 804
 805    foreach my $s (keys %WordLinks){                      # put word-to-word
 806	foreach my $t (keys %{$WordLinks{$s}}){           # links together
 807	    $self->add2LinkCluster($s,$t,\@LinkCluster);  # (link clusters)
 808	}
 809    }
 810
 811
 812    #-----------------------------------
 813    # 4) convert link cluster to word/phrase links
 814
 815    $self->getClusterLinks(\@LinkCluster,$Links);
 816}
 817
 818#-------------------------------------------------------------------------
 819# check if there are alignments containing horicontal AND vertical links
 820# (---> return 0 if there are such links!)
 821
 822sub CheckWordLinks{
 823    my $srclinks=shift;
 824    my $trglinks=shift;
 825
 826    foreach my $s (keys %{$srclinks}){
 827	foreach my $t (keys %{$$srclinks{$s}}){
 828	    if (keys %{$$srclinks{$s}} > 1){
 829		if (keys %{$$trglinks{$t}} > 1){
 830		    return 0;
 831		}
 832	    }
 833	}
 834    }
 835    return 1;
 836}
 837
 838
 839#===========================================================================
 840# symmetric alignment (bi-directional)
 841#   1) get links in both directions
 842#   2) get intersection of links
 843#   3) iteratively add new links to existing link clusters
 844#===========================================================================
 845
 846
 847sub bidirectionalRefinedSearch{
 848    my $self=shift;
 849    my $Links=shift;
 850    my $MinScore=shift;
 851    my $competitive=shift;
 852    if (not defined $MinScore){
 853	$MinScore=0.00000000000001;
 854    }
 855
 856    my $LinkProb=$self->{matrix};
 857    my @LinkCluster;
 858    my ($x,$y);
 859
 860    #-----------------------------------
 861    # 1) get directional links
 862
 863    my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
 864    my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
 865
 866    #-----------------------------------
 867    # 2) intersection of directional links
 868
 869    foreach (0..$#SrcLinks){
 870	if ((defined $SrcLinks[$_]) and
 871	    ($TrgLinks[$SrcLinks[$_]] eq $_)){
 872	    $self->add2LinkCluster($_,$SrcLinks[$_],
 873				   \@LinkCluster);  # (link clusters)
 874	}
 875    }
 876
 877
 878    #-----------------------------------
 879    # 3) add overlapping links
 880    #    * sort all scores in the matrix
 881    #    * run through possible links starting with the highest score
 882    #    * repeat until no more links can be added
 883    #
 884    # links (s,t) are added if
 885    #    * there is no other link for both, s AND t
 886    #    * or ..the new link is adjacent to another link in source OR target
 887    #           and thew new link does not create links which have neighbors
 888    #           in both directions
 889
 890    my %scores=();
 891    foreach my $s (0..$#{$LinkProb}){
 892	foreach my $t (0..$#{$$LinkProb[$s]}){   # put all scores
 893	    $scores{"$s:$t"}=$$LinkProb[$s][$t]; # in a long list
 894	}
 895    }
 896
 897    my $added=0;
 898    do{
 899	$added=0;
 900	foreach my $pair (sort {$scores{$b} <=> $scores{$a} } keys %scores){
 901	    if ($scores{$pair}<$MinScore){last;}
 902	    my ($s,$t)=split(/\:/,$pair);
 903
 904	    if ($self->add2LinkCluster($s,$t,\@LinkCluster)){
 905		$added++;
 906		delete $scores{$pair};
 907	    }
 908	}
 909    }
 910    until (not $added);      # repeat as long as links are added!
 911
 912    #-----------------------------------
 913    # 4) convert link cluster to word/phrase links
 914
 915    $self->getClusterLinks(\@LinkCluster,$Links);
 916}
 917
 918
 919
 920
 921# ------------------  bi-directional alignment (union) ------------------
 922#
 923# union of links in both diretions
 924#
 925
 926sub bidirectionalUnion{
 927    my $self=shift;
 928    my $Links=shift;
 929    my $MinScore=shift;
 930    my $competitive=shift;
 931
 932    my @LinkCluster;
 933    my ($x,$y);
 934
 935    my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
 936    foreach (0..$#SrcLinks){
 937	if (defined $SrcLinks[$_]){
 938	    $self->add2LinkCluster($_,$SrcLinks[$_],\@LinkCluster);
 939	}
 940    }
 941    my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
 942    foreach (0..$#TrgLinks){
 943	if (defined $TrgLinks[$_]){
 944	    $self->add2LinkCluster($TrgLinks[$_],$_,\@LinkCluster);
 945	}
 946    }
 947
 948    $self->getClusterLinks(\@LinkCluster,$Links);
 949}
 950
 951
 952# ------------------  bi-directional alignment (intersection) -------------
 953#
 954# intersection of links in both directions
 955#
 956
 957sub bidirectionalIntersection{
 958    my $self=shift;
 959    my $Links=shift;
 960    my $MinScore=shift;
 961    my $competitive=shift;
 962
 963    my @LinkCluster;
 964    my ($x,$y);
 965
 966    my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
 967    my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
 968
 969    foreach (0..$#SrcLinks){
 970	if ((defined $SrcLinks[$_]) and
 971	    ($TrgLinks[$SrcLinks[$_]] eq $_)){
 972	    $self->add2LinkCluster($_,$SrcLinks[$_],\@LinkCluster);
 973	    $SrcLinks[$_]=undef;
 974	    $TrgLinks[$SrcLinks[$_]]=undef;
 975	}
 976    }
 977
 978    $self->getClusterLinks(\@LinkCluster,$Links);
 979}
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991# ------------------------------------
 992# get best links from source to target words
 993
 994sub bestSrcLinks{
 995    my $self=shift;
 996    my $MinScore=shift;                 # score threshold
 997    my $competitive=shift;              # enable/disable competive linking
 998
 999    if ($competitive){
1000	return $self->competitiveSrcLinks($MinScore,@_);
1001    }
1002
1003    my $LinkProb=$self->{matrix};
1004    my $Token=$self->{token};
1005    my $NrSrc=$#{$$Token{source}};
1006    my $NrTrg=$#{$$Token{target}};
1007
1008    my @Links=();
1009
1010    # ----------------------------
1011    my @LinkMatrix=();
1012    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
1013    # ----------------------------
1014
1015    foreach my $s (0..$NrSrc){
1016	my $bestLink=0;
1017	my $bestScore=$LinkMatrix[$s][$bestLink];
1018	foreach my $t (1..$NrTrg){
1019	    if ($LinkMatrix[$s][$t]>$bestScore){
1020		$bestLink=$t;
1021		$bestScore=$LinkMatrix[$s][$bestLink];
1022	    }
1023	}
1024	if ($LinkMatrix[$s][$bestLink]<$MinScore){next;}
1025#	if ($LinkMatrix[$s][$bestLink]<$MinScore){last;}
1026	$Links[$s]=$bestLink;
1027    }
1028
1029    return @Links;
1030}
1031
1032# ------------------------------------
1033# get best links from target to source words
1034
1035sub bestTrgLinks{
1036    my $self=shift;
1037    my $MinScore=shift;                 # score threshold
1038    my $competitive=shift;              # enable/disable competive linking
1039
1040    if ($competitive){
1041	return $self->competitiveTrgLinks($MinScore,@_);
1042    }
1043
1044    my $LinkProb=$self->{matrix};
1045    my $Token=$self->{token};
1046    my $NrSrc=$#{$$Token{source}};
1047    my $NrTrg=$#{$$Token{target}};
1048
1049    my @Links=();
1050
1051    # ----------------------------
1052    my @LinkMatrix=();
1053    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
1054    # ----------------------------
1055
1056    foreach my $t (0..$NrTrg){
1057	my $bestLink=0;
1058	my $bestScore=$LinkMatrix[$bestLink][$t];
1059	foreach my $s (1..$NrSrc){
1060	    if ($LinkMatrix[$s][$t]>$bestScore){
1061		$bestLink=$s;
1062		$bestScore=$LinkMatrix[$bestLink][$t];
1063	    }
1064	}
1065	if ($LinkMatrix[$bestLink][$t]<$MinScore){next;}
1066#	if ($LinkMatrix[$bestLink][$t]<$MinScore){last;}
1067	$Links[$t]=$bestLink;
1068    }
1069    return @Links;
1070}
1071
1072
1073# ------------------------------------
1074# competitive linking from source to target
1075
1076
1077sub competitiveSrcLinks{
1078    my $self=shift;
1079    my $MinScore=shift;                 # score threshold
1080
1081    my $LinkProb=$self->{matrix};
1082    my $Token=$self->{token};
1083    my $NrSrc=$#{$$Token{source}};
1084    my $NrTrg=$#{$$Token{target}};
1085    my @Links=();
1086
1087    # ----------------------------
1088    my @LinkMatrix=();
1089    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
1090    # ----------------------------
1091
1092    my ($s,$t);
1093    while (($s,$t)=$self->getTopLink(\@LinkMatrix,$MinScore)){
1094	if ($LinkMatrix[$s][$t]<$MinScore){next;}
1095	$LinkMatrix[$s][$t]=0;
1096
1097	$Links[$s]=$t;
1098
1099	foreach my $x (0..$NrSrc){$LinkMatrix[$x][$t]=0;}
1100	foreach my $x (0..$NrTrg){$LinkMatrix[$s][$x]=0;}
1101    }
1102    return @Links;
1103}
1104
1105# ------------------------------------
1106# competitive linking from target to source
1107
1108sub competitiveTrgLinks{
1109    my $self=shift;
1110    my $MinScore=shift;                 # score threshold
1111
1112    my $LinkProb=$self->{matrix};
1113    my $Token=$self->{token};
1114    my $NrSrc=$#{$$Token{source}};
1115    my $NrTrg=$#{$$Token{target}};
1116    my @Links=();
1117
1118    # ----------------------------
1119    my @LinkMatrix=();
1120    $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
1121    # ----------------------------
1122
1123    my ($s,$t);
1124    while (($s,$t)=$self->getTopLink(\@LinkMatrix,$MinScore)){
1125	if ($LinkMatrix[$s][$t]<$MinScore){next;}
1126	$LinkMatrix[$s][$t]=0;
1127
1128	$Links[$t]=$s;
1129
1130	foreach my $x (0..$NrSrc){$LinkMatrix[$x][$t]=0;}
1131	foreach my $x (0..$NrTrg){$LinkMatrix[$s][$x]=0;}
1132    }
1133    return @Links;
1134}
1135
1136
1137#==========================================================================
1138#
1139# get the word-to-word link with the highest score from the clue matrix
1140#
1141#==========================================================================
1142
1143sub getTopLink{
1144    my $self=shift;
1145    my $LinkProb=shift;
1146    my $MinScore=shift;
1147
1148    my $bestX=undef;
1149    my $bestY=undef;
1150    my $bestVal;
1151
1152    if (not ref($self->{SORTEDLINKS})){
1153	$self->sortLinks($LinkProb,$MinScore);
1154    }
1155    my $top=shift @{$self->{SORTEDLINKS}};
1156    if (not defined $top){
1157	delete $self->{SORTEDLINKS};
1158    }
1159    my @link=split (':',$top);
1160    return @link;
1161}
1162
1163sub sortLinks{
1164    my $self=shift;
1165    my $LinkProb=shift;
1166    my $MinScore=shift;
1167    $self->{ALLLINKS}={};
1168    foreach my $x (0..$#{$LinkProb}){
1169	foreach my $y (0..$#{$$LinkProb[$x]}){
1170	    if ($$LinkProb[$x][$y]<$MinScore){next;}
1171	    if ($$LinkProb[$x][$y]<=0){next;}
1172	    $self->{ALLLINKS}->{"$x:$y"}=$$LinkProb[$x][$y];
1173	}
1174    }
1175    @{$self->{SORTEDLINKS}}=
1176	sort {$self->{ALLLINKS}->{$b} <=> $self->{ALLLINKS}->{$a}} 
1177    keys %{$self->{ALLLINKS}};
1178}
1179
1180sub getTopLinkOld{
1181    my $self=shift;
1182    my $LinkProb=shift;
1183
1184    my $bestX=undef;
1185    my $bestY=undef;
1186    my $bestVal;
1187
1188    foreach my $x (0..$#{$LinkProb}){
1189	my @sort = sort {$$LinkProb[$x][$b] <=> $$LinkProb[$x][$a]} 
1190	                (0..$#{$$LinkProb[$x]});
1191	if ($$LinkProb[$x][$sort[0]]>$bestVal){
1192	    $bestVal=$$LinkProb[$x][$sort[0]];
1193	    $bestX="$x";
1194	    $bestY="$sort[0]";
1195	}
1196    }
1197    if ((defined $bestX) and (defined $bestY)){
1198	return ($bestX,$bestY);
1199    }
1200    else{
1201	return ();
1202    }
1203}
1204
1205#==========================================================================
1206#
1207# getClusterLinks:
1208#    make word/phrase links out of link clusters
1209#    (add all necessary information for storing links, 
1210#     e.g. token pairs, id's, byte spans)
1211#
1212#==========================================================================
1213
1214
1215sub getClusterLinks{
1216    my $self=shift;
1217    my $LinkCluster=shift;
1218    my $Links=shift;
1219
1220    my $LinkProb=$self->{matrix};
1221    my $TokenAttr=$self->{tokenAttr};
1222
1223    if (ref($Links) ne 'HASH'){$Links={};}
1224
1225    foreach (0..$#{$LinkCluster}){
1226	if (keys %{$$LinkCluster[$_]{src}}){
1227	    if (keys %{$$LinkCluster[$_]{trg}}){
1228		my $src=join ':',sort {$a<=>$b} keys %{$$LinkCluster[$_]{src}};
1229		my $trg=join ':',sort {$a<=>$b} keys %{$$LinkCluster[$_]{trg}};
1230		my $score=$self->getMatrixScore($LinkProb,
1231						$$LinkCluster[$_]{src},
1232						$$LinkCluster[$_]{trg});
1233		my $link=$self->getLinkString($TokenAttr,$src,$trg);
1234
1235		$$Links{$src}{link}=$link;
1236		$$Links{$src}{source}=
1237		    $self->ngramIDs($src,$TokenAttr,'source');
1238		$$Links{$src}{target}=
1239		    $self->ngramIDs($trg,$TokenAttr,'target');
1240#		my $span=$self->ngramSpans($src,$TokenAttr,'source');
1241#		if ($span){$$Links{$src}{src}=$span;}
1242#		$span=$self->ngramSpans($trg,$TokenAttr,'target');
1243#		if ($span){$$Links{$src}{trg}=$span;}
1244		$$Links{$src}{score}=$score;
1245	    }
1246	}
1247    }
1248    return $Links;
1249}
1250
1251
1252sub getMatrixScore{
1253    my $self=shift;
1254    my ($matrix,$src,$trg)=@_;
1255    my $score=0;
1256    my $count;
1257    foreach my $s (keys %{$src}){
1258	foreach my $t (keys %{$trg}){
1259	    if ($$matrix[$s][$t]>0){
1260		$score+=log($$matrix[$s][$t]);
1261		$count++;
1262	    }
1263	}
1264    }
1265    if ($count){
1266	$score/=$count;
1267    }
1268    return exp($score);
1269}
1270
1271
1272#==========================================================================
1273#
1274# add links to link clusters
1275#
1276#==========================================================================
1277
1278sub add2LinkCluster{
1279    my $self=shift;
1280    my ($x,$y,$cluster)=@_;
1281    my @overlap=$self->findClusterOverlap($x,$y,$cluster);
1282    if ((not $self->parameter('allow_multiple_overlaps')) and (@overlap>1)){
1283#	print STDERR "disregard $x - $y (multi-overlap)!\n";
1284	return 0;
1285    }
1286    elsif (@overlap){
1287	if ($self->parameter('in_phrases_only')){
1288	    if ($self->parameter('fill_phrases')){
1289		if (not $self->fillPhrases($x,$y,$cluster,$overlap[0])){
1290#		    print STDERR "disregard $x - $y (fill phrase)!\n";
1291		    return 0;
1292		}
1293	    }
1294	    if (not $self->isInPhrase($x,$y,$$cluster[$overlap[0]])){
1295#		print STDERR "disregard $x - $y (not in phrase)!\n";
1296		return 0;
1297	    }
1298	}
1299	if ($self->parameter('adjacent_only')){
1300	    if (not $self->isAdjacent($x,$y,$$cluster[$overlap[0]])){
1301#		print STDERR "disregard $x - $y (not adjacent)!\n";
1302		return 0;
1303	    }
1304	}
1305	if ($self->parameter('adjacent_score')){
1306	    if (not $self->isAdjacentScore($x,$y,$$cluster[$overlap[0]],
1307				      $self->parameter('adjacent_score'))){
1308#s		print STDERR "disregard $x - $y (score difference to adjacent too big)!\n";
1309		return 0;
1310	    }
1311	}
1312	$$cluster[$overlap[0]]{src}{$x}=1;
1313	$$cluster[$overlap[0]]{trg}{$y}=1;
1314	if (@overlap>1){                              # join all overlapping
1315	    foreach my $o (1..$#overlap){             # link clusters!
1316		foreach (keys %{$$cluster[$overlap[$o]]{src}}){
1317		    delete $$cluster[$overlap[$o]]{src}{$_};
1318		    $$cluster[$overlap[0]]{src}{$_}=1;
1319		}
1320		foreach (keys %{$$cluster[$overlap[$o]]{trg}}){
1321		    delete $$cluster[$overlap[$o]]{trg}{$_};
1322		    $$cluster[$overlap[0]]{trg}{$_}=1;
1323		}
1324	    }
1325	}
1326    }
1327    else{
1328	$$cluster[$#{$cluster}+1]={};
1329	$$cluster[-1]{src}{$x}=1;
1330	$$cluster[-1]{trg}{$y}=1;
1331    }
1332    return 1;
1333}
1334
1335
1336sub isInPhrase{
1337    my $self=shift;
1338    my ($newX,$newY,$cluster)=@_;
1339    my @srcAccepted=keys %{$self->{srcToken}};
1340    my @trgAccepted=keys %{$self->{trgToken}};
1341
1342    my %src=%{$cluster->{src}};
1343    my %trg=%{$cluster->{trg}};
1344    $src{$newX}=1;
1345    $trg{$newY}=1;
1346
1347#    my $srcPhr=join ':',sort {$a <=> $b} keys %src;
1348#    my $trgPhr=join ':',sort {$a <=> $b} keys %trg;
1349
1350    my $srcPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %src;
1351    my $trgPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %trg;
1352
1353    if (grep(/$srcPhr/,@srcAccepted)){
1354	if (grep(/$trgPhr/,@trgAccepted)){
1355#	    my @missing=$self->getMissingTokens(\%src,\%trg);
1356	    return 1;
1357	}
1358    }
1359    return 0;
1360}
1361
1362sub fillPhrases{
1363    my $self=shift;
1364    my ($newX,$newY,$cluster,$nr)=@_;
1365
1366    my %link=();
1367    %{$link{src}}=%{$cluster->[$nr]->{src}};
1368    %{$link{trg}}=%{$cluster->[$nr]->{trg}};
1369    $link{src}{$newX}=1;
1370    $link{trg}{$newY}=1;
1371
1372    my @missing=$self->getMissingTokens($link{src},$link{trg});
1373    if (not @missing){
1374	return 0;
1375    }
1376    my @missSrc=split(/:/,$missing[0]);
1377    my @missTrg=split(/:/,$missing[1]);
1378    my %overlap=();
1379    foreach my $s (@missSrc){
1380	$self->findSrcOverlap($s,$cluster,\%overlap);
1381	$link{src}{$s}=1;
1382    }
1383    foreach my $t (@missTrg){
1384	$self->findTrgOverlap($t,$cluster,\%overlap);
1385	$link{trg}{$t}=1;
1386    }
1387    foreach (keys %overlap){
1388	if (not $self->isIncluded($cluster->[$_],\%link)){
1389	    foreach (@missSrc){delete $link{src}{$_};}
1390	    foreach (@missTrg){delete $link{trg}{$_};}
1391	    return 0;
1392	}
1393
1394#############  !!!!!!!!!!!!!! change this:
1395	print STDERR "delete cluster $_!\n";
1396	$cluster->[$_]->{src}=();   
1397	$cluster->[$_]->{trg}=();
1398#############  !!!!!!!!!!!!!! change this:
1399
1400    }
1401
1402    if (@missSrc or @missTrg){                  # ... just for information
1403	print STDERR "fill cluster $nr with missing tokens!\n";
1404    }
1405
1406    foreach (keys %{$link{src}}){
1407	$cluster->[$nr]->{src}->{$_}=1;
1408    }
1409    foreach (keys %{$link{trg}}){
1410	$cluster->[$nr]->{trg}->{$_}=1;
1411    }
1412    return 1;
1413}
1414
1415
1416#sub removeClusterInclusions{
1417#    my $self=shift;
1418#    my $cluster=shift;
1419#    foreach my $c (@{$cluster}){
1420#	my $src=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$$cluster[$c]{src}};
1421#	my $trg=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$$cluster[$c]{trg}};
1422#    }
1423#}
1424
1425
1426sub isIncluded{
1427    my $self=shift;
1428    my ($cluster1,$cluster2)=@_;
1429    foreach (keys %{$cluster1->{src}}){
1430	if (not defined $cluster2->{src}->{$_}){return 0;}
1431    }
1432    foreach (keys %{$cluster1->{trg}}){
1433	if (not defined $cluster2->{trg}->{$_}){return 0;}
1434    }
1435    return 1;
1436}
1437
1438sub findSrcOverlap{
1439    my $self=shift;
1440    return $self->findOverlap('src',@_);
1441}
1442sub findTrgOverlap{
1443    my $self=shift;
1444    return $self->findOverlap('trg',@_);
1445}
1446
1447sub findOverlap{
1448    my $self=shift;
1449    my ($lang,$token,$cluster,$overlap)=@_;
1450    my @c=grep (defined $$cluster[$_]{$lang}{$token},0..$#{$cluster});
1451    foreach (@c){
1452	$$overlap{$_}=1;
1453    }
1454}
1455
1456sub getMissingTokens{
1457    my $self=shift;
1458    my ($src,$trg)=@_;
1459    my @srcAccepted=keys %{$self->{srcToken}};
1460    my @trgAccepted=keys %{$self->{trgToken}};
1461
1462    my $srcPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$src};
1463    my $trgPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$trg};
1464
1465    my $missingSrc=undef;
1466    my $missingTrg=undef;
1467
1468    my @match;
1469
1470    if (@match=grep(/$srcPhr/,@srcAccepted)){
1471
1472	@match=sort {length($a) <=> length($b)} @match;
1473	if ($match[0]=~/^(.*)$srcPhr(.*)$/){
1474	    $missingSrc="$1$2$3$4$5$6$7$8$9";
1475	}
1476
1477	if (@match=grep(/$trgPhr/,@trgAccepted)){
1478
1479	    @match=sort {length($a) <=> length($b)} @match;
1480	    if ($match[0]=~/^(.*)$trgPhr(.*)$/){
1481		$missingTrg="$1$2$3$4$5$6$7$8$9";
1482	    }
1483	    $missingSrc=~s/^://;$missingSrc=~s/:$//;
1484	    $missingTrg=~s/^://;$missingTrg=~s/:$//;
1485	    return ($missingSrc,$missingTrg);
1486	}
1487    }
1488
1489    return ();
1490}
1491
1492
1493sub isAdjacent{
1494    my $self=shift;
1495    my ($x,$y,$cluster)=@_;
1496    if ((defined $$cluster{src}{$x}) and
1497	((defined $$cluster{trg}{$y-1}) or
1498	 ((defined $$cluster{trg}{$y+1})))){
1499	return 1;
1500    }
1501    if ((defined $$cluster{trg}{$y}) and
1502	((defined $$cluster{src}{$x-1}) or
1503	 ((defined $$cluster{src}{$x+1})))){
1504	return 1;
1505    }
1506    return 0;
1507}
1508
1509sub isAdjacentScore{
1510    my $self=shift;
1511    my ($x,$y,$cluster,$p)=@_;
1512
1513    if ((defined $$cluster{src}{$x}) and
1514	(defined $$cluster{trg}{$y-1})){
1515	if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x]->[$y-1]*$p){
1516	    return 1;
1517	}
1518	return 0;
1519    }
1520    if ((defined $$cluster{src}{$x}) and
1521	(defined $$cluster{trg}{$y+1})){
1522	if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x]->[$y+1]*$p){
1523	    return 1;
1524	}
1525	return 0;
1526    }
1527    if ((defined $$cluster{src}{$x-1}) and
1528	(defined $$cluster{trg}{$y})){
1529	if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x-1]->[$y]*$p){
1530	    return 1;
1531	}
1532	return 0;
1533    }
1534    if ((defined $$cluster{src}{$x+1}) and
1535	(defined $$cluster{trg}{$y})){
1536	if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x+1]->[$y]*$p){
1537	    return 1;
1538	}
1539	return 0;
1540    }
1541    return 0;
1542}
1543
1544
1545sub findClusterOverlap{
1546    my $self=shift;
1547    my ($x,$y,$cluster)=@_;
1548    my @overlap=();
1549    foreach (0..$#{$cluster}){
1550	if (defined $$cluster[$_]{src}{$x}){
1551	    push(@overlap,$_);
1552	}
1553	elsif (defined $$cluster[$_]{trg}{$y}){
1554	    push(@overlap,$_);
1555	}
1556    }
1557    return @overlap;
1558}
1559
1560
1561
1562#========================================================================
1563
1564sub cloneLinkMatrix{
1565    my $self=shift;
1566    my $matrix=shift;
1567    my $clone=shift;
1568
1569    if (ref($matrix) ne 'ARRAY'){return ();}
1570    if (ref($clone) ne 'ARRAY'){$clone=[];}
1571
1572    foreach my $x (0..$#{$matrix}){
1573	foreach my $y (0..$#{$$matrix[$x]}){
1574	    $$clone[$x][$y]=$$matrix[$x][$y];
1575	}
1576    }
1577    return $clone;
1578}
1579
1580
1581
1582
1583#==========================================================================
1584#
1585#
1586#
1587#==========================================================================
1588
1589
1590
1591sub clueMatrixToHtml{
1592    my $self=shift;
1593
1594    my $Matrix=$self->{matrix};
1595    my $Token=$self->{token};
1596    my $SrcTok=$$Token{source};
1597    my $TrgTok=$$Token{target};
1598    my $nrSrc=$#{$$Token{source}};
1599    my $nrTrg=$#{$$Token{target}};
1600
1601    my $max;
1602    foreach my $s (0..$nrSrc){
1603        foreach my $t (0..$nrTrg){
1604            if ($Matrix->[$s]->[$t]>$max){$max=$Matrix->[$s]->[$t];}
1605        }
1606    }
1607    if (not $max){$max=1;}
1608
1609    my $html="<p>\n";
1610    $html.="<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n";
1611    $html.="<tr><th></th>\n";
1612
1613    foreach my $t (0..$nrTrg){
1614        my $str=$TrgTok->[$t];
1615        $html.="<th>$str</th>\n";
1616    }
1617
1618    foreach my $s (0..$nrSrc){
1619        $html.="</tr><tr>\n";
1620        my $str=$SrcTok->[$s];
1621        $html.="<th>$str</th>\n";
1622        foreach my $t (0..$nrTrg){
1623            my $score=0;
1624            if ($Matrix->[$s]){
1625                if ($Matrix->[$s]->[$t]){
1626                    $score=$Matrix->[$s]->[$t];
1627                }
1628            }
1629            my $color=255-$score*256/$max;
1630            if ($color==-1){$color=0;}
1631	    my $hex=sprintf("%X",$color);
1632	    if (length($hex)<2){$hex="0$hex";}
1633	    my $val=int(100*$score);
1634            if ($color<128){
1635		$html.="<td bgcolor=\"#$hex$hex$hex\">";
1636		$html.='<font color="#ffffff">';
1637		$html.="$val</font></td>\n";
1638	    }
1639	    else{
1640		$html.="<td bgcolor=\"#$hex$hex$hex\">";
1641                $html.="$val</td>\n";
1642	    }
1643        }
1644    }
1645    $html.="</tr></table><hr>\n";
1646    return $html;
1647}
1648
1649
1650
1651sub printHtmlClueMatrix{
1652    my $self=shift;
1653    print STDERR $self->clueMatrixToHtml();
1654}
1655
1656
1657
1658sub printClueMatrix{
1659    my $self=shift;
1660
1661    my ($SrcTok,$TrgTok,$Matrix)=@_;
1662
1663    my $nrSrc=$#{$SrcTok};
1664    my $nrTrg=$#{$TrgTok};
1665
1666
1667    print STDERR "\n=====================================================\n";
1668    print STDERR "final clue matrix scores";
1669    print STDERR "\n=====================================================\n";
1670
1671    foreach my $s (0..$nrSrc){
1672	foreach my $t (0..$nrTrg){
1673	    my $score=$Matrix->[$s]->[$t];
1674	    if ($score>0){
1675#		printf STDERR "[%2d-%-2d] %15s - %-15s: %s\n",
1676		printf STDERR "[%d %d] %20s - %-20s %s\n",
1677		$s,$t,$$SrcTok[$s],$$TrgTok[$t],$score;
1678	    }
1679	}
1680    }
1681    print STDERR "\n=====================================================\n";
1682    print STDERR "clue matrix $nrSrc x $nrTrg";
1683    print STDERR "\n=====================================================\n";
1684
1685    my @char=();
1686    &MakeCharArr($TrgTok,\@char);
1687    foreach my $c (0..$#char){
1688	printf STDERR "\n%10s",' '; 
1689	foreach (@{$char[$c]}){
1690	    printf STDERR "%4s",$_;
1691	}
1692    }
1693
1694    print STDERR "\n";
1695
1696    foreach my $s (0..$nrSrc){
1697	my $str=substr($SrcTok->[$s],0,10);
1698	$str=&Uplug::Encoding::convert($str,'utf-8','iso-8859-1');
1699
1700	printf STDERR "%10s",$str; 
1701	foreach my $t (0..$nrTrg){
1702	    my $score=0;
1703	    if ($Matrix->[$s]){
1704		if ($Matrix->[$s]->[$t]){
1705		    $score=$Matrix->[$s]->[$t];
1706		}
1707	    }
1708	    printf STDERR " %3d",$score*100;
1709	}
1710	print STDERR "\n";
1711    }
1712}
1713
1714
1715sub MakeCharArr{
1716    my ($tok,$char)=@_;
1717
1718    my @lat1=@{$tok};
1719
1720#    my @lat1=();
1721#    foreach (0..$#{$tok}){
1722#	$lat1[$_]=&Uplug::Data::encode($tok->[$_],'utf-8','iso-8859-1');
1723#    }
1724
1725    map ($lat1[$_]=&Uplug::Encoding::convert($lat1[$_],'utf-8','iso-8859-1'),
1726	 (0..$#lat1));
1727
1728    my $max=&MaxLength(\@lat1);
1729    foreach my $t (0..$#{$tok}){
1730	my @c=split(//,$lat1[$t]);
1731	foreach (1..$max){
1732	    if (@c){
1733		$char->[$max-$_]->[$t]=pop(@c);
1734#		$char->[$max-$_]->[$t]=shift(@c);
1735	    }
1736	    else{$char->[$max-$_]->[$t]=' ';}
1737	}
1738    }
1739}
1740
1741sub MaxLength{
1742    my ($tok)=@_;
1743    my $max=0;
1744    foreach (@{$tok}){
1745	if (length($_)>$max){$max=length($_);}
1746    }
1747    return $max;
1748}
1749
1750
1751
1752
1753
1754######### return a true value
1755
17561;
1757