PageRenderTime 49ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/uplug-main/lib/Uplug/Align/Word/Clue.pm

https://bitbucket.org/tiedemann/uplug
Perl | 1757 lines | 1127 code | 335 blank | 295 comment | 143 complexity | cce623150dbedaaae122e96aa1b531ca MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. #-*-perl-*-
  2. ####################################################################
  3. # Copyright (C) 2004 Jörg Tiedemann <joerg@stp.ling.uu.se>
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. #
  19. ###########################################################################
  20. # Uplug::Align::Word::Clue
  21. #
  22. #
  23. #
  24. ###########################################################################
  25. package Uplug::Align::Word::Clue;
  26. use strict;
  27. # use Time::HiRes qw(time);
  28. use vars qw(@ISA $DEBUG);
  29. use vars qw($INPHRASESONLY $ADJACENTONLY $ADJACENTSCORE $FILLPHRASES);
  30. use vars qw($ALLOWMULTIOVERLAP $PRINTHTML);
  31. # use utf8;
  32. use Uplug::Data;
  33. use Uplug::Align::Word;
  34. use Data::Dumper;
  35. $Data::Dumper::Indent=1;
  36. $Data::Dumper::Terse=1;
  37. $Data::Dumper::Purity=1;
  38. @ISA = qw( Uplug::Align::Word );
  39. $DEBUG = 0;
  40. #---------------------------------
  41. # parameters for add2LinkCluster
  42. $INPHRASESONLY = 0; # if = 1 --> no links outside of chunks
  43. $ADJACENTONLY = 0; # if = 1 --> allow only adjacent links
  44. $ADJACENTSCORE = 0; # if > 0 --> $score >= $neighbor * $ADJACENTSCORE
  45. # $ALLOWMULTIOVERLAP = 0; # allow overlap with more than 1 link cluster!
  46. # $ADJACENTSCORE = 0.4;
  47. $ADJACENTSCORE = 0;
  48. $FILLPHRASES = 0; # ... doesn't work ....
  49. sub new{
  50. my $class=shift;
  51. my $self=$class->SUPER::new(@_);
  52. if (not defined $self->parameter('adjacent_only')){
  53. $self->setParameter('adjacent_only',$ADJACENTONLY);
  54. }
  55. if (not defined $self->parameter('adjacent_score')){
  56. $self->setParameter('adjacent_score',$ADJACENTSCORE);
  57. }
  58. if (not defined $self->parameter('in_phrases_only')){
  59. $self->setParameter('in_phrases_only',$INPHRASESONLY);
  60. }
  61. if (not defined $self->parameter('fill_phrase')){
  62. $self->setParameter('fill_phrases',$FILLPHRASES);
  63. }
  64. if (not defined $self->parameter('allow_multiple_overalps')){
  65. $self->setParameter('allow_multiple_overlaps',$ALLOWMULTIOVERLAP);
  66. }
  67. if (not defined $self->parameter('verbose')){
  68. $self->setParameter('verbose',$DEBUG);
  69. }
  70. else{$DEBUG=$self->parameter('verbose');}
  71. return $self;
  72. }
  73. sub DESTROY {
  74. my $self = shift;
  75. }
  76. #===========================================================================
  77. #
  78. # get all link scores and fill the clue matrix
  79. #
  80. #===========================================================================
  81. sub getLinkScores{
  82. my $self=shift;
  83. $self->{matrix}=[];
  84. $self->{links}={};
  85. my $LinkProb=$self->{matrix};
  86. my $links=$self->{linkStreams};
  87. my $SrcTok=$self->{srcToken};
  88. my $TrgTok=$self->{trgToken};
  89. my $Param=$self->{param};
  90. my $data=$self->{data};
  91. ## prepare clue param hash (reduce checks in the loop below)
  92. my %ClueParam=%{$Param};
  93. if (exists $ClueParam{general}){delete $ClueParam{general};}
  94. if (exists $ClueParam{original}){delete $ClueParam{original};}
  95. foreach (keys %ClueParam){
  96. if (ref($ClueParam{$_}) ne 'HASH'){$ClueParam{$_}={};}
  97. if (not defined $ClueParam{$_}{'score weight'}){
  98. $ClueParam{$_}{'score weight'}=$self->defaultClueWeight();
  99. }
  100. }
  101. ## define some variables used in the loop
  102. my $weight; # clue combination weight
  103. my ($src,$trg); # source and target language tokens
  104. my $score; # clue score found for the current pair
  105. my $time; # time (for debugging)
  106. my %search; # hash of patterns for searching clues
  107. my $found=Uplug::Data->new; # clues found
  108. my @SrcTok; # positions of the current source
  109. my @TrgTok; # and target tokens in the sentence
  110. my ($s,$t,$x,$y,$p); # variables for iteration
  111. my $ScoreComb=$self->parameter('score combination');
  112. if (not $ScoreComb){$ScoreComb='probabilistic';}
  113. if ($self->parameter('verbose')){
  114. print STDERR "\n=====================================================\n";
  115. print STDERR "matching clue scores";
  116. print STDERR "\n=====================================================\n";
  117. }
  118. ## the following loop takes most of the time!
  119. foreach $s (sort {$a <=> $b} keys %{$SrcTok}){
  120. foreach $t (keys %{$TrgTok}){
  121. $time=time();
  122. ($src,$trg)=($$SrcTok{$s}{general},$$TrgTok{$t}{general});
  123. $self->alignIdentical($src,$trg,$s,$t,$LinkProb);
  124. ### DEBUG: store search time
  125. $self->{identical_score_time}+=time()-$time if ($DEBUG);
  126. foreach (keys %ClueParam){
  127. $time=time();
  128. $weight=$ClueParam{$_}{'score weight'};
  129. if ($ClueParam{$_}{'relative position'}){
  130. ($src,$trg)=$self->makeRelPosFeature($$SrcTok{$s}{$_},
  131. $$TrgTok{$t}{$_});
  132. }
  133. else{($src,$trg)=($$SrcTok{$s}{$_},$$TrgTok{$t}{$_});}
  134. ### DEBUG: store search time
  135. $self->{before_score_time}+=time()-$time if ($DEBUG);
  136. $score=0;
  137. #---------------------------------------
  138. # length difference as scores ...
  139. #---------------------------------------
  140. if ($ClueParam{$_}{'string length difference'}){
  141. $score=$data->lengthQuotient($src,$trg);
  142. }
  143. #---------------------------------------
  144. # otherwise: search scores in link-DB
  145. #---------------------------------------
  146. else{
  147. if (not defined $links->{$_}){next;}
  148. if (defined($src) and defined($trg)){
  149. %search=('source' => $src,
  150. 'target' => $trg);
  151. $time=time();
  152. if ($links->{$_}->select($found,\%search)){
  153. $score=$found->attribute('score');
  154. }
  155. ### DEBUG: store search time
  156. $self->{search_score_time}+=time()-$time if ($DEBUG);
  157. }
  158. }
  159. $time=time();
  160. #---------------------------------------
  161. # set weighted score in score matrix
  162. #---------------------------------------
  163. if (not $score){next;}
  164. if (not $data->checkPairParameter($src,$trg,$ClueParam{$_})){
  165. ### DEBUG: store search time
  166. $self->{after_score_time}+=time()-$time if ($DEBUG);
  167. next;
  168. }
  169. if (exists $ClueParam{$_}{'minimal score'}){
  170. if ($score<$ClueParam{$_}{'minimal score'}){
  171. ### DEBUG: store search time
  172. $self->{after_score_time}+=time()-$time if ($DEBUG);
  173. next;
  174. }
  175. }
  176. $score*=$weight;
  177. # shouldn't be >1, but in case ...
  178. #--------------------------------
  179. if ($score>=1){$score=0.999999999999;}
  180. #--------------------------------
  181. if ($self->parameter('verbose')){
  182. # printf STDERR "%20s [ %s %s ] %15s - %-15s %s\n",
  183. printf STDERR "$_\t$s\t$t\t$src\t$trg\t$score\n";
  184. }
  185. @SrcTok=split(/:/,$s);
  186. @TrgTok=split(/:/,$t);
  187. foreach $x (@SrcTok){
  188. foreach $y (@TrgTok){
  189. # if ($self->parameter('verbose')){
  190. # printf STDERR "%20s [%d %d] %15s - %-15s %s\n",
  191. # $_,$x,$y,$src,$trg,$score;
  192. # }
  193. if ($ScoreComb eq 'addition'){
  194. $$LinkProb[$x][$y]+=$score;
  195. }
  196. #
  197. # log-linear and multiplication are useless!
  198. # * there's not always a positive score for each possible pair!
  199. # --> multiplications with one factor = 0 --> score = 0
  200. # --> leaving out zero-factors -> implicit penalty for pairs with multiple
  201. # clue scores
  202. #
  203. # elsif ($ScoreComb eq 'log-linear'){
  204. # $$LinkProb[$x][$y]+=log($score);
  205. # }
  206. # elsif ($ScoreComb eq 'multiplication'){
  207. # $$LinkProb[$x][$y]+=log($score);
  208. # }
  209. else{
  210. $p=$$LinkProb[$x][$y];
  211. $$LinkProb[$x][$y]=$p+$score-$p*$score;
  212. }
  213. }
  214. }
  215. ### DEBUG: store search time
  216. $self->{after_score_time}+=time()-$time if ($DEBUG);
  217. }
  218. }
  219. }
  220. $time=time();
  221. $self->align1x($LinkProb);
  222. # if ($ScoreComb eq 'log-linear'){ # special for log-linear:
  223. # foreach $x (0..$#{$LinkProb}){ # reverse log (make positiv
  224. # foreach $y (0..$#{$$LinkProb[$x]}){ # score values)
  225. # $$LinkProb[$x][$y]=exp($$LinkProb[$x][$y]);
  226. # }
  227. # }
  228. # }
  229. if ($self->parameter('verbose')){
  230. $self->printClueMatrix($self->{token}->{source},
  231. $self->{token}->{target},
  232. $self->{matrix});
  233. $self->printBitextTokensWithID();
  234. # $self->printBitextToken($self->{token}->{source},
  235. # $self->{token}->{target});
  236. }
  237. ### DEBUG: store search time
  238. $self->{'1x_score_time'}+=time()-$time if ($DEBUG);
  239. }
  240. #===========================================================================
  241. #
  242. # search for the best alignment using the clue matrix scores
  243. #
  244. # topLinkSearch ........ iteratively add top links to link clusters
  245. # nextBestSearch ....... score = distance to next best link (+topLinkSearch)
  246. # oneOneFirstSearch .... non-overlapping first, overlapping then
  247. # competitiveSearch .... competitive linking (1:1 links only!)
  248. # bidirectionalRefineSearch intersection of directional links + overlapping
  249. # directionalSrcSearch ..... best alignment source --> target
  250. # directionalTrgSearch ..... best alignment target --> source
  251. # bidirectionalUnion ....... union of directionalSrc & directionalTrg
  252. # bidirectionalIntersection intersection of directionalSrc & directionalTrg
  253. #
  254. # parameter search: nextbest ........ nextBestSearch
  255. # oneone....... ... oneOneFirstSearch
  256. # competitive ..... competitiveSearch
  257. # myrefined ....... bidirectionalRefinedSearch
  258. # och ............. bidirectionalRefinedSearchOch
  259. # src ............. directionalSrcSearch
  260. # trg ............. directionalTrgSearch
  261. # union ........... bidirectionalUnion
  262. # intersection .... bidirectionalIntersection
  263. # <default> ....... topLinkSearch
  264. #
  265. #===========================================================================
  266. sub findAlignment{
  267. my $self=shift;
  268. $self->{links}={};
  269. my $minScore=$self->scoreThreshold();
  270. my $method=$self->parameter('search');
  271. if ($method=~/nextbest/){
  272. return $self->nextBestSearch($self->{links},$minScore);}
  273. elsif ($method=~/competitive/){
  274. return $self->competitiveSearch($self->{links},$minScore);}
  275. elsif ($method=~/oneone/){
  276. return $self->oneOneFirstSearch($self->{links},$minScore);}
  277. elsif ($method=~/myrefined/){
  278. return $self->bidirectionalRefinedSearch($self->{links},$minScore);}
  279. elsif ($method=~/(och|refined)/){
  280. return $self->bidirectionalRefinedSearchOch($self->{links},$minScore);}
  281. elsif ($method=~/src/){
  282. return $self->directionalSrcSearch($self->{links},$minScore);}
  283. elsif ($method=~/trg/){
  284. return $self->directionalTrgSearch($self->{links},$minScore);}
  285. elsif ($method=~/union/){
  286. return $self->bidirectionalUnion($self->{links},$minScore);}
  287. elsif ($method=~/intersection/){
  288. return $self->bidirectionalIntersection($self->{links},$minScore);}
  289. else{
  290. return $self->topLinkSearch($self->{links},$minScore);}
  291. }
  292. #===========================================================================
  293. # add scores to the clue matrix for
  294. # sentence alignments with only 1 word in either source or target
  295. #===========================================================================
  296. sub align1x{
  297. my $self=shift;
  298. my ($LinkProb)=@_;
  299. my $Align11=$self->parameter('align 1:1');
  300. my $Align1x=$self->parameter('align 1:x');
  301. if ($Align11 and ($#{$LinkProb}==0)){
  302. if ($#{$$LinkProb[0]}==0){
  303. my $p=$$LinkProb[0][0];
  304. $$LinkProb[0][0]=$p+$Align11-$p*$Align11;
  305. return;
  306. }
  307. }
  308. if ($Align1x and ($#{$LinkProb}==0)){
  309. foreach (0..$#{$$LinkProb[0]}){
  310. my $p=$$LinkProb[0][$_];
  311. $$LinkProb[0][$_]=$p+$Align1x-$p*$Align1x;
  312. }
  313. return;
  314. }
  315. if ($Align1x){
  316. my $ok=1;
  317. foreach (0..$#{$LinkProb}){
  318. if ($#{$$LinkProb[$_]}!=0){$ok=0;}
  319. }
  320. if ($ok){
  321. foreach (0..$#{$LinkProb}){
  322. my $p=$$LinkProb[$_][0];
  323. $$LinkProb[$_][0]=$p+$Align1x-$p*$Align1x;
  324. }
  325. }
  326. }
  327. }
  328. #===========================================================================
  329. # add scores to the clue matrix for
  330. # pairs of identical tokens with at least one non-alphabetical character
  331. # (hard-coded as /[^A-Za-z]/ !!!!!!)
  332. #===========================================================================
  333. sub alignIdentical{
  334. my $self=shift;
  335. my $AlignIdentical=$self->parameter('align identical');
  336. if (not $AlignIdentical){return;}
  337. my ($src,$trg,$s,$t,$LinkProb)=@_;
  338. if ($src=~/[^A-Za-z]/){
  339. if ($src eq $trg){
  340. my @SrcTok=split(/:/,$s);
  341. my @TrgTok=split(/:/,$t);
  342. foreach my $x (@SrcTok){
  343. foreach my $y (@TrgTok){
  344. my $p=$$LinkProb[$x][$y];
  345. $$LinkProb[$x][$y]=$p+$AlignIdentical-$p*$AlignIdentical;
  346. }
  347. }
  348. }
  349. }
  350. }
  351. #===========================================================================
  352. #
  353. # topLinkSearch:
  354. # 1) search best link in the matrix
  355. # 2) add link to link clusters
  356. # 3) continue with 1) until finished
  357. #
  358. #===========================================================================
  359. sub topLinkSearch{
  360. my $self=shift;
  361. my $Links=shift;
  362. my $MinScore=shift;
  363. my $LinkProb=$self->{matrix};
  364. my $Token=$self->{token};
  365. my $TokenAttr=$self->{tokenAttr};
  366. my @SrcLinks;
  367. my @TrgLinks;
  368. my $NrSrc=$#{$$Token{source}};
  369. my $NrTrg=$#{$$Token{target}};
  370. my @LinkMatrix;
  371. my @LinkCluster;
  372. my ($x,$y);
  373. # ----------------------------
  374. # print STDERR "---------new sentence-------$MinScore-------\n";
  375. undef $self->{SORTEDLINKS};
  376. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix); # clone the matrix
  377. while (($x,$y)=$self->getTopLink(\@LinkMatrix,$MinScore)){
  378. # print STDERR "$x:$y:$LinkMatrix[$x][$y]\n";
  379. if ($MinScore=~/\%/){
  380. $MinScore=$LinkMatrix[$x][$y]*$MinScore/100;
  381. # print STDERR "## minscore == $MinScore\n";
  382. }
  383. if (not defined($LinkMatrix[$x][$y])){last;}
  384. if ($LinkMatrix[$x][$y]<$MinScore){last;}
  385. if ($self->add2LinkCluster($x,$y,\@LinkCluster)){
  386. $LinkMatrix[$x][$y]=0;
  387. }
  388. }
  389. # ----------------------------
  390. # get the links from the set of link clusters
  391. $self->getClusterLinks(\@LinkCluster,$Links); # get links
  392. }
  393. #===========================================================================
  394. #
  395. # nextBestSearch:
  396. # 1) find score distance to "next best link" for each word pair
  397. # 2) call topLinkSearch
  398. #
  399. #===========================================================================
  400. sub nextBestSearch{
  401. my $self=shift;
  402. my $LinkProb=$self->{matrix};
  403. $self->nextBestMatrix($LinkProb);
  404. return $self->topLinkSearch(@_);
  405. }
  406. sub nextBestMatrix{
  407. my $self=shift;
  408. my ($LinkProb)=@_;
  409. my @SortedColumns=();
  410. my @SortedRows=();
  411. my $sizeX=$#{$LinkProb};
  412. my $sizeY=$#{$$LinkProb[0]};
  413. foreach my $x (0..$sizeX){
  414. @{$SortedColumns[$x]}=
  415. sort {$$LinkProb[$x][$b] <=> $$LinkProb[$x][$a]} (0..$sizeY);
  416. }
  417. foreach my $y (0..$sizeY){
  418. @{$SortedRows[$y]}=
  419. sort {$$LinkProb[$b][$y] <=> $$LinkProb[$a][$y]} (0..$sizeX);
  420. }
  421. my @LinkMatrix=();
  422. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
  423. my $lowest=0;
  424. foreach my $x (0..$sizeX){
  425. foreach my $y (0..$sizeY){
  426. my $NextBestY=$SortedColumns[$x][0];
  427. if ($NextBestY==$y){$NextBestY=$SortedColumns[$x][1];}
  428. my $NextBestX=$SortedRows[$y][0];
  429. if ($NextBestX==$x){$NextBestX=$SortedRows[$y][1];}
  430. my $NextBest=$LinkMatrix[$NextBestX][$y];
  431. if ($LinkMatrix[$x][$NextBestY]>$NextBest){
  432. $NextBest=$LinkMatrix[$x][$NextBestY];
  433. }
  434. $$LinkProb[$x][$y]-=$NextBest;
  435. if ($$LinkProb[$x][$y]<$lowest){
  436. $lowest=$$LinkProb[$x][$y];
  437. }
  438. }
  439. }
  440. foreach my $x (0..$sizeX){ # normalize!
  441. foreach my $y (0..$sizeY){ # no negative values
  442. $$LinkProb[$x][$y]-=$lowest; # in the matrix!
  443. }
  444. }
  445. if ($self->parameter('verbose')){
  446. $self->printClueMatrix($self->{token}->{source},
  447. $self->{token}->{target},
  448. $LinkProb);
  449. }
  450. }
  451. #===========================================================================
  452. #
  453. # oneOneFirstSearch:
  454. # 1) find all one-to-one word links first (non-overlapping links)
  455. # 2) add iteratively overlapping links
  456. #
  457. #===========================================================================
  458. sub oneOneFirstSearch{
  459. my $self=shift;
  460. my $Links=shift;
  461. my $MinScore=shift;
  462. my $LinkProb=$self->{matrix};
  463. my $Token=$self->{token};
  464. my $TokenAttr=$self->{tokenAttr};
  465. my @SrcLinks;
  466. my @TrgLinks;
  467. my $NrSrc=$#{$$Token{source}};
  468. my $NrTrg=$#{$$Token{target}};
  469. my @LinkMatrix;
  470. my @LinkCluster;
  471. my ($x,$y);
  472. # ----------------------------
  473. # 1) get all word-to-word links without any overlaps
  474. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix); # clone the matrix
  475. while (($x,$y)=$self->getTopLink(\@LinkMatrix,$MinScore)){
  476. if ($MinScore=~/\%/){
  477. $MinScore=$LinkMatrix[$x][$y]*$MinScore/100;
  478. print STDERR "## minscore == $MinScore\n";
  479. }
  480. if ($LinkMatrix[$x][$y]<$MinScore){last;}
  481. my @overlap=$self->findClusterOverlap($x,$y,\@LinkCluster);
  482. if (not @overlap){
  483. $LinkCluster[$#LinkCluster+1]={};
  484. $LinkCluster[-1]{src}{$x}=1;
  485. $LinkCluster[-1]{trg}{$y}=1;
  486. }
  487. $LinkMatrix[$x][$y]=0;
  488. }
  489. # ----------------------------
  490. # 2) do it again --> find overlapping links!
  491. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix); # clone the matrix
  492. while (($x,$y)=$self->getTopLink(\@LinkMatrix,$MinScore)){
  493. if ($LinkMatrix[$x][$y]<$MinScore){last;}
  494. $self->add2LinkCluster($x,$y,\@LinkCluster);
  495. $LinkMatrix[$x][$y]=0;
  496. }
  497. # ----------------------------
  498. # get the links from the set of link clusters
  499. $self->getClusterLinks(\@LinkCluster,$Links); # get links
  500. }
  501. #===========================================================================
  502. # ------------------ directional alignment (source to target) ----------------
  503. #===========================================================================
  504. sub directionalSrcSearch{
  505. my $self=shift;
  506. my $Links=shift;
  507. my $MinScore=shift;
  508. my $competitive=shift;
  509. my @LinkCluster;
  510. my ($x,$y);
  511. my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
  512. foreach (0..$#SrcLinks){
  513. if ((defined $SrcLinks[$_]) and
  514. ($SrcLinks[$_] > 0)){
  515. $self->add2LinkCluster($_,$SrcLinks[$_],\@LinkCluster);
  516. }
  517. }
  518. $self->getClusterLinks(\@LinkCluster,$Links);
  519. }
  520. #===========================================================================
  521. # ------------------ directional alignment (target to source ) ---------------
  522. #===========================================================================
  523. sub directionalTrgSearch{
  524. my $self=shift;
  525. my $Links=shift;
  526. my $MinScore=shift;
  527. my $competitive=shift;
  528. my @LinkCluster;
  529. my ($x,$y);
  530. my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
  531. foreach (0..$#TrgLinks){
  532. if ((defined $TrgLinks[$_]) and
  533. ($TrgLinks[$_] > 0)){
  534. $self->add2LinkCluster($TrgLinks[$_],$_,\@LinkCluster);
  535. }
  536. }
  537. $self->getClusterLinks(\@LinkCluster,$Links);
  538. }
  539. #===========================================================================
  540. # competitive linking
  541. # 1) get best word-to-word link (s,t)
  542. # 2) remove alternative links for (s) and for (t)
  543. # 3) go to 1) until finished
  544. #===========================================================================
  545. sub competitiveSearch{
  546. my $self=shift;
  547. my $Links=shift;
  548. my $MinScore=shift;
  549. if (not defined $MinScore){
  550. $MinScore=0.00000000000001;
  551. }
  552. my $Token=$self->{token};
  553. my $NrSrc=$#{$$Token{source}};
  554. my $NrTrg=$#{$$Token{target}};
  555. my @WordLinks=();
  556. if ($NrTrg>$NrSrc){
  557. return $self->directionalTrgSearch($Links,$MinScore,1);
  558. }
  559. return $self->directionalSrcSearch($Links,$MinScore,1);
  560. }
  561. #===========================================================================
  562. # refined symmetric link search a la Och&Ney
  563. #
  564. #===========================================================================
  565. sub bidirectionalRefinedSearchOch{
  566. my $self=shift;
  567. my $Links=shift;
  568. my $MinScore=shift;
  569. my $competitive=shift;
  570. if (not defined $MinScore){
  571. $MinScore=0.00000000000001;
  572. }
  573. my $LinkProb=$self->{matrix};
  574. my @LinkCluster;
  575. my %WordLinks=();
  576. my %InvWordLinks=();
  577. my ($x,$y);
  578. #-----------------------------------
  579. # 1) get directional links
  580. my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
  581. my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
  582. #-----------------------------------
  583. # 2) intersection of directional links
  584. foreach (0..$#SrcLinks){
  585. if ((defined $SrcLinks[$_]) and
  586. ($TrgLinks[$SrcLinks[$_]] eq $_)){
  587. $WordLinks{$_}{$SrcLinks[$_]}=1;
  588. $InvWordLinks{$SrcLinks[$_]}{$_}=1;
  589. # print STDERR "$_ --> $SrcLinks[$_]\n";
  590. }
  591. }
  592. #-----------------------------------
  593. # 3) add overlapping links
  594. # * sort all scores in the matrix
  595. # * run through possible links starting with the highest score
  596. # * repeat until no more links can be added
  597. #
  598. # links (s,t) are added if
  599. # * there is no other link for both, s AND t
  600. # * or ..the new link is adjacent to another link in source OR target
  601. # and thew new link does not create links which have neighbors
  602. # in both directions
  603. my %scores=();
  604. foreach my $s (0..$#{$LinkProb}){
  605. foreach my $t (0..$#{$$LinkProb[$s]}){ # put all scores
  606. $scores{"$s:$t"}=$$LinkProb[$s][$t]; # in a long list
  607. }
  608. }
  609. my $added=0;
  610. do{
  611. $added=0;
  612. foreach my $pair (sort {$scores{$b} <=> $scores{$a} } keys %scores){
  613. if ($scores{$pair}<$MinScore){last;}
  614. my ($s,$t)=split(/\:/,$pair);
  615. if (((not defined $WordLinks{$s}) or # if no other links
  616. (not keys %{$WordLinks{$s}})) and # are defined for both,
  617. ((not defined $InvWordLinks{$t}) or # source AND target
  618. (not keys %{$InvWordLinks{$t}}))){ # word:
  619. $added++;
  620. $scores{$pair}=0; # add the link
  621. $WordLinks{$s}{$t}=1;
  622. $InvWordLinks{$t}{$s}=1;
  623. # print STDERR "add $s --> $t (new)\n";
  624. }
  625. elsif ((($s>0) and
  626. (defined $WordLinks{$s-1}{$t})) or # the link has a
  627. (defined $WordLinks{$s+1}{$t}) or # vertical neighbor
  628. (($t>0) and
  629. (defined $WordLinks{$s}{$t-1})) or # or a
  630. (defined $WordLinks{$s}{$t+1})){ # horizontal neighbor
  631. $InvWordLinks{$t}{$s}=1;
  632. $WordLinks{$s}{$t}=1; # if there are
  633. if (&CheckWordLinks(\%WordLinks, # no links with
  634. \%InvWordLinks)){ # neighbors in both
  635. $added++; # dimensions! -->
  636. $scores{$pair}=0; # add the new link
  637. # print STDERR "add $s --> $t (adj)\n";
  638. }
  639. else{ # else:
  640. delete $WordLinks{$s}{$t}; # delete the link
  641. delete $InvWordLinks{$t}{$s};
  642. # print STDERR "reject $s --> $t\n";
  643. }
  644. }
  645. }
  646. }
  647. until (not $added); # repeat as long as links are added!
  648. $self->setParameter('adjacent_only',0);
  649. $self->setParameter('adjacent_score',0);
  650. foreach my $s (keys %WordLinks){ # put word-to-word
  651. foreach my $t (keys %{$WordLinks{$s}}){ # links together
  652. $self->add2LinkCluster($s,$t,\@LinkCluster); # (link clusters)
  653. }
  654. }
  655. #-----------------------------------
  656. # 4) convert link cluster to word/phrase links
  657. $self->getClusterLinks(\@LinkCluster,$Links);
  658. }
  659. #-------------------------------------------------------------------------
  660. # check if there are alignments containing horicontal AND vertical links
  661. # (---> return 0 if there are such links!)
  662. sub CheckWordLinks{
  663. my $srclinks=shift;
  664. my $trglinks=shift;
  665. foreach my $s (keys %{$srclinks}){
  666. foreach my $t (keys %{$$srclinks{$s}}){
  667. if (keys %{$$srclinks{$s}} > 1){
  668. if (keys %{$$trglinks{$t}} > 1){
  669. return 0;
  670. }
  671. }
  672. }
  673. }
  674. return 1;
  675. }
  676. #===========================================================================
  677. # symmetric alignment (bi-directional)
  678. # 1) get links in both directions
  679. # 2) get intersection of links
  680. # 3) iteratively add new links to existing link clusters
  681. #===========================================================================
  682. sub bidirectionalRefinedSearch{
  683. my $self=shift;
  684. my $Links=shift;
  685. my $MinScore=shift;
  686. my $competitive=shift;
  687. if (not defined $MinScore){
  688. $MinScore=0.00000000000001;
  689. }
  690. my $LinkProb=$self->{matrix};
  691. my @LinkCluster;
  692. my ($x,$y);
  693. #-----------------------------------
  694. # 1) get directional links
  695. my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
  696. my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
  697. #-----------------------------------
  698. # 2) intersection of directional links
  699. foreach (0..$#SrcLinks){
  700. if ((defined $SrcLinks[$_]) and
  701. ($TrgLinks[$SrcLinks[$_]] eq $_)){
  702. $self->add2LinkCluster($_,$SrcLinks[$_],
  703. \@LinkCluster); # (link clusters)
  704. }
  705. }
  706. #-----------------------------------
  707. # 3) add overlapping links
  708. # * sort all scores in the matrix
  709. # * run through possible links starting with the highest score
  710. # * repeat until no more links can be added
  711. #
  712. # links (s,t) are added if
  713. # * there is no other link for both, s AND t
  714. # * or ..the new link is adjacent to another link in source OR target
  715. # and thew new link does not create links which have neighbors
  716. # in both directions
  717. my %scores=();
  718. foreach my $s (0..$#{$LinkProb}){
  719. foreach my $t (0..$#{$$LinkProb[$s]}){ # put all scores
  720. $scores{"$s:$t"}=$$LinkProb[$s][$t]; # in a long list
  721. }
  722. }
  723. my $added=0;
  724. do{
  725. $added=0;
  726. foreach my $pair (sort {$scores{$b} <=> $scores{$a} } keys %scores){
  727. if ($scores{$pair}<$MinScore){last;}
  728. my ($s,$t)=split(/\:/,$pair);
  729. if ($self->add2LinkCluster($s,$t,\@LinkCluster)){
  730. $added++;
  731. delete $scores{$pair};
  732. }
  733. }
  734. }
  735. until (not $added); # repeat as long as links are added!
  736. #-----------------------------------
  737. # 4) convert link cluster to word/phrase links
  738. $self->getClusterLinks(\@LinkCluster,$Links);
  739. }
  740. # ------------------ bi-directional alignment (union) ------------------
  741. #
  742. # union of links in both diretions
  743. #
  744. sub bidirectionalUnion{
  745. my $self=shift;
  746. my $Links=shift;
  747. my $MinScore=shift;
  748. my $competitive=shift;
  749. my @LinkCluster;
  750. my ($x,$y);
  751. my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
  752. foreach (0..$#SrcLinks){
  753. if (defined $SrcLinks[$_]){
  754. $self->add2LinkCluster($_,$SrcLinks[$_],\@LinkCluster);
  755. }
  756. }
  757. my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
  758. foreach (0..$#TrgLinks){
  759. if (defined $TrgLinks[$_]){
  760. $self->add2LinkCluster($TrgLinks[$_],$_,\@LinkCluster);
  761. }
  762. }
  763. $self->getClusterLinks(\@LinkCluster,$Links);
  764. }
  765. # ------------------ bi-directional alignment (intersection) -------------
  766. #
  767. # intersection of links in both directions
  768. #
  769. sub bidirectionalIntersection{
  770. my $self=shift;
  771. my $Links=shift;
  772. my $MinScore=shift;
  773. my $competitive=shift;
  774. my @LinkCluster;
  775. my ($x,$y);
  776. my @SrcLinks=$self->bestSrcLinks($MinScore,$competitive);
  777. my @TrgLinks=$self->bestTrgLinks($MinScore,$competitive);
  778. foreach (0..$#SrcLinks){
  779. if ((defined $SrcLinks[$_]) and
  780. ($TrgLinks[$SrcLinks[$_]] eq $_)){
  781. $self->add2LinkCluster($_,$SrcLinks[$_],\@LinkCluster);
  782. $SrcLinks[$_]=undef;
  783. $TrgLinks[$SrcLinks[$_]]=undef;
  784. }
  785. }
  786. $self->getClusterLinks(\@LinkCluster,$Links);
  787. }
  788. # ------------------------------------
  789. # get best links from source to target words
  790. sub bestSrcLinks{
  791. my $self=shift;
  792. my $MinScore=shift; # score threshold
  793. my $competitive=shift; # enable/disable competive linking
  794. if ($competitive){
  795. return $self->competitiveSrcLinks($MinScore,@_);
  796. }
  797. my $LinkProb=$self->{matrix};
  798. my $Token=$self->{token};
  799. my $NrSrc=$#{$$Token{source}};
  800. my $NrTrg=$#{$$Token{target}};
  801. my @Links=();
  802. # ----------------------------
  803. my @LinkMatrix=();
  804. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
  805. # ----------------------------
  806. foreach my $s (0..$NrSrc){
  807. my $bestLink=0;
  808. my $bestScore=$LinkMatrix[$s][$bestLink];
  809. foreach my $t (1..$NrTrg){
  810. if ($LinkMatrix[$s][$t]>$bestScore){
  811. $bestLink=$t;
  812. $bestScore=$LinkMatrix[$s][$bestLink];
  813. }
  814. }
  815. if ($LinkMatrix[$s][$bestLink]<$MinScore){next;}
  816. # if ($LinkMatrix[$s][$bestLink]<$MinScore){last;}
  817. $Links[$s]=$bestLink;
  818. }
  819. return @Links;
  820. }
  821. # ------------------------------------
  822. # get best links from target to source words
  823. sub bestTrgLinks{
  824. my $self=shift;
  825. my $MinScore=shift; # score threshold
  826. my $competitive=shift; # enable/disable competive linking
  827. if ($competitive){
  828. return $self->competitiveTrgLinks($MinScore,@_);
  829. }
  830. my $LinkProb=$self->{matrix};
  831. my $Token=$self->{token};
  832. my $NrSrc=$#{$$Token{source}};
  833. my $NrTrg=$#{$$Token{target}};
  834. my @Links=();
  835. # ----------------------------
  836. my @LinkMatrix=();
  837. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
  838. # ----------------------------
  839. foreach my $t (0..$NrTrg){
  840. my $bestLink=0;
  841. my $bestScore=$LinkMatrix[$bestLink][$t];
  842. foreach my $s (1..$NrSrc){
  843. if ($LinkMatrix[$s][$t]>$bestScore){
  844. $bestLink=$s;
  845. $bestScore=$LinkMatrix[$bestLink][$t];
  846. }
  847. }
  848. if ($LinkMatrix[$bestLink][$t]<$MinScore){next;}
  849. # if ($LinkMatrix[$bestLink][$t]<$MinScore){last;}
  850. $Links[$t]=$bestLink;
  851. }
  852. return @Links;
  853. }
  854. # ------------------------------------
  855. # competitive linking from source to target
  856. sub competitiveSrcLinks{
  857. my $self=shift;
  858. my $MinScore=shift; # score threshold
  859. my $LinkProb=$self->{matrix};
  860. my $Token=$self->{token};
  861. my $NrSrc=$#{$$Token{source}};
  862. my $NrTrg=$#{$$Token{target}};
  863. my @Links=();
  864. # ----------------------------
  865. my @LinkMatrix=();
  866. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
  867. # ----------------------------
  868. my ($s,$t);
  869. while (($s,$t)=$self->getTopLink(\@LinkMatrix,$MinScore)){
  870. if ($LinkMatrix[$s][$t]<$MinScore){next;}
  871. $LinkMatrix[$s][$t]=0;
  872. $Links[$s]=$t;
  873. foreach my $x (0..$NrSrc){$LinkMatrix[$x][$t]=0;}
  874. foreach my $x (0..$NrTrg){$LinkMatrix[$s][$x]=0;}
  875. }
  876. return @Links;
  877. }
  878. # ------------------------------------
  879. # competitive linking from target to source
  880. sub competitiveTrgLinks{
  881. my $self=shift;
  882. my $MinScore=shift; # score threshold
  883. my $LinkProb=$self->{matrix};
  884. my $Token=$self->{token};
  885. my $NrSrc=$#{$$Token{source}};
  886. my $NrTrg=$#{$$Token{target}};
  887. my @Links=();
  888. # ----------------------------
  889. my @LinkMatrix=();
  890. $self->cloneLinkMatrix($LinkProb,\@LinkMatrix);
  891. # ----------------------------
  892. my ($s,$t);
  893. while (($s,$t)=$self->getTopLink(\@LinkMatrix,$MinScore)){
  894. if ($LinkMatrix[$s][$t]<$MinScore){next;}
  895. $LinkMatrix[$s][$t]=0;
  896. $Links[$t]=$s;
  897. foreach my $x (0..$NrSrc){$LinkMatrix[$x][$t]=0;}
  898. foreach my $x (0..$NrTrg){$LinkMatrix[$s][$x]=0;}
  899. }
  900. return @Links;
  901. }
  902. #==========================================================================
  903. #
  904. # get the word-to-word link with the highest score from the clue matrix
  905. #
  906. #==========================================================================
  907. sub getTopLink{
  908. my $self=shift;
  909. my $LinkProb=shift;
  910. my $MinScore=shift;
  911. my $bestX=undef;
  912. my $bestY=undef;
  913. my $bestVal;
  914. if (not ref($self->{SORTEDLINKS})){
  915. $self->sortLinks($LinkProb,$MinScore);
  916. }
  917. my $top=shift @{$self->{SORTEDLINKS}};
  918. if (not defined $top){
  919. delete $self->{SORTEDLINKS};
  920. }
  921. my @link=split (':',$top);
  922. return @link;
  923. }
  924. sub sortLinks{
  925. my $self=shift;
  926. my $LinkProb=shift;
  927. my $MinScore=shift;
  928. $self->{ALLLINKS}={};
  929. foreach my $x (0..$#{$LinkProb}){
  930. foreach my $y (0..$#{$$LinkProb[$x]}){
  931. if ($$LinkProb[$x][$y]<$MinScore){next;}
  932. if ($$LinkProb[$x][$y]<=0){next;}
  933. $self->{ALLLINKS}->{"$x:$y"}=$$LinkProb[$x][$y];
  934. }
  935. }
  936. @{$self->{SORTEDLINKS}}=
  937. sort {$self->{ALLLINKS}->{$b} <=> $self->{ALLLINKS}->{$a}}
  938. keys %{$self->{ALLLINKS}};
  939. }
  940. sub getTopLinkOld{
  941. my $self=shift;
  942. my $LinkProb=shift;
  943. my $bestX=undef;
  944. my $bestY=undef;
  945. my $bestVal;
  946. foreach my $x (0..$#{$LinkProb}){
  947. my @sort = sort {$$LinkProb[$x][$b] <=> $$LinkProb[$x][$a]}
  948. (0..$#{$$LinkProb[$x]});
  949. if ($$LinkProb[$x][$sort[0]]>$bestVal){
  950. $bestVal=$$LinkProb[$x][$sort[0]];
  951. $bestX="$x";
  952. $bestY="$sort[0]";
  953. }
  954. }
  955. if ((defined $bestX) and (defined $bestY)){
  956. return ($bestX,$bestY);
  957. }
  958. else{
  959. return ();
  960. }
  961. }
  962. #==========================================================================
  963. #
  964. # getClusterLinks:
  965. # make word/phrase links out of link clusters
  966. # (add all necessary information for storing links,
  967. # e.g. token pairs, id's, byte spans)
  968. #
  969. #==========================================================================
  970. sub getClusterLinks{
  971. my $self=shift;
  972. my $LinkCluster=shift;
  973. my $Links=shift;
  974. my $LinkProb=$self->{matrix};
  975. my $TokenAttr=$self->{tokenAttr};
  976. if (ref($Links) ne 'HASH'){$Links={};}
  977. foreach (0..$#{$LinkCluster}){
  978. if (keys %{$$LinkCluster[$_]{src}}){
  979. if (keys %{$$LinkCluster[$_]{trg}}){
  980. my $src=join ':',sort {$a<=>$b} keys %{$$LinkCluster[$_]{src}};
  981. my $trg=join ':',sort {$a<=>$b} keys %{$$LinkCluster[$_]{trg}};
  982. my $score=$self->getMatrixScore($LinkProb,
  983. $$LinkCluster[$_]{src},
  984. $$LinkCluster[$_]{trg});
  985. my $link=$self->getLinkString($TokenAttr,$src,$trg);
  986. $$Links{$src}{link}=$link;
  987. $$Links{$src}{source}=
  988. $self->ngramIDs($src,$TokenAttr,'source');
  989. $$Links{$src}{target}=
  990. $self->ngramIDs($trg,$TokenAttr,'target');
  991. # my $span=$self->ngramSpans($src,$TokenAttr,'source');
  992. # if ($span){$$Links{$src}{src}=$span;}
  993. # $span=$self->ngramSpans($trg,$TokenAttr,'target');
  994. # if ($span){$$Links{$src}{trg}=$span;}
  995. $$Links{$src}{score}=$score;
  996. }
  997. }
  998. }
  999. return $Links;
  1000. }
  1001. sub getMatrixScore{
  1002. my $self=shift;
  1003. my ($matrix,$src,$trg)=@_;
  1004. my $score=0;
  1005. my $count;
  1006. foreach my $s (keys %{$src}){
  1007. foreach my $t (keys %{$trg}){
  1008. if ($$matrix[$s][$t]>0){
  1009. $score+=log($$matrix[$s][$t]);
  1010. $count++;
  1011. }
  1012. }
  1013. }
  1014. if ($count){
  1015. $score/=$count;
  1016. }
  1017. return exp($score);
  1018. }
  1019. #==========================================================================
  1020. #
  1021. # add links to link clusters
  1022. #
  1023. #==========================================================================
  1024. sub add2LinkCluster{
  1025. my $self=shift;
  1026. my ($x,$y,$cluster)=@_;
  1027. my @overlap=$self->findClusterOverlap($x,$y,$cluster);
  1028. if ((not $self->parameter('allow_multiple_overlaps')) and (@overlap>1)){
  1029. # print STDERR "disregard $x - $y (multi-overlap)!\n";
  1030. return 0;
  1031. }
  1032. elsif (@overlap){
  1033. if ($self->parameter('in_phrases_only')){
  1034. if ($self->parameter('fill_phrases')){
  1035. if (not $self->fillPhrases($x,$y,$cluster,$overlap[0])){
  1036. # print STDERR "disregard $x - $y (fill phrase)!\n";
  1037. return 0;
  1038. }
  1039. }
  1040. if (not $self->isInPhrase($x,$y,$$cluster[$overlap[0]])){
  1041. # print STDERR "disregard $x - $y (not in phrase)!\n";
  1042. return 0;
  1043. }
  1044. }
  1045. if ($self->parameter('adjacent_only')){
  1046. if (not $self->isAdjacent($x,$y,$$cluster[$overlap[0]])){
  1047. # print STDERR "disregard $x - $y (not adjacent)!\n";
  1048. return 0;
  1049. }
  1050. }
  1051. if ($self->parameter('adjacent_score')){
  1052. if (not $self->isAdjacentScore($x,$y,$$cluster[$overlap[0]],
  1053. $self->parameter('adjacent_score'))){
  1054. #s print STDERR "disregard $x - $y (score difference to adjacent too big)!\n";
  1055. return 0;
  1056. }
  1057. }
  1058. $$cluster[$overlap[0]]{src}{$x}=1;
  1059. $$cluster[$overlap[0]]{trg}{$y}=1;
  1060. if (@overlap>1){ # join all overlapping
  1061. foreach my $o (1..$#overlap){ # link clusters!
  1062. foreach (keys %{$$cluster[$overlap[$o]]{src}}){
  1063. delete $$cluster[$overlap[$o]]{src}{$_};
  1064. $$cluster[$overlap[0]]{src}{$_}=1;
  1065. }
  1066. foreach (keys %{$$cluster[$overlap[$o]]{trg}}){
  1067. delete $$cluster[$overlap[$o]]{trg}{$_};
  1068. $$cluster[$overlap[0]]{trg}{$_}=1;
  1069. }
  1070. }
  1071. }
  1072. }
  1073. else{
  1074. $$cluster[$#{$cluster}+1]={};
  1075. $$cluster[-1]{src}{$x}=1;
  1076. $$cluster[-1]{trg}{$y}=1;
  1077. }
  1078. return 1;
  1079. }
  1080. sub isInPhrase{
  1081. my $self=shift;
  1082. my ($newX,$newY,$cluster)=@_;
  1083. my @srcAccepted=keys %{$self->{srcToken}};
  1084. my @trgAccepted=keys %{$self->{trgToken}};
  1085. my %src=%{$cluster->{src}};
  1086. my %trg=%{$cluster->{trg}};
  1087. $src{$newX}=1;
  1088. $trg{$newY}=1;
  1089. # my $srcPhr=join ':',sort {$a <=> $b} keys %src;
  1090. # my $trgPhr=join ':',sort {$a <=> $b} keys %trg;
  1091. my $srcPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %src;
  1092. my $trgPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %trg;
  1093. if (grep(/$srcPhr/,@srcAccepted)){
  1094. if (grep(/$trgPhr/,@trgAccepted)){
  1095. # my @missing=$self->getMissingTokens(\%src,\%trg);
  1096. return 1;
  1097. }
  1098. }
  1099. return 0;
  1100. }
  1101. sub fillPhrases{
  1102. my $self=shift;
  1103. my ($newX,$newY,$cluster,$nr)=@_;
  1104. my %link=();
  1105. %{$link{src}}=%{$cluster->[$nr]->{src}};
  1106. %{$link{trg}}=%{$cluster->[$nr]->{trg}};
  1107. $link{src}{$newX}=1;
  1108. $link{trg}{$newY}=1;
  1109. my @missing=$self->getMissingTokens($link{src},$link{trg});
  1110. if (not @missing){
  1111. return 0;
  1112. }
  1113. my @missSrc=split(/:/,$missing[0]);
  1114. my @missTrg=split(/:/,$missing[1]);
  1115. my %overlap=();
  1116. foreach my $s (@missSrc){
  1117. $self->findSrcOverlap($s,$cluster,\%overlap);
  1118. $link{src}{$s}=1;
  1119. }
  1120. foreach my $t (@missTrg){
  1121. $self->findTrgOverlap($t,$cluster,\%overlap);
  1122. $link{trg}{$t}=1;
  1123. }
  1124. foreach (keys %overlap){
  1125. if (not $self->isIncluded($cluster->[$_],\%link)){
  1126. foreach (@missSrc){delete $link{src}{$_};}
  1127. foreach (@missTrg){delete $link{trg}{$_};}
  1128. return 0;
  1129. }
  1130. ############# !!!!!!!!!!!!!! change this:
  1131. print STDERR "delete cluster $_!\n";
  1132. $cluster->[$_]->{src}=();
  1133. $cluster->[$_]->{trg}=();
  1134. ############# !!!!!!!!!!!!!! change this:
  1135. }
  1136. if (@missSrc or @missTrg){ # ... just for information
  1137. print STDERR "fill cluster $nr with missing tokens!\n";
  1138. }
  1139. foreach (keys %{$link{src}}){
  1140. $cluster->[$nr]->{src}->{$_}=1;
  1141. }
  1142. foreach (keys %{$link{trg}}){
  1143. $cluster->[$nr]->{trg}->{$_}=1;
  1144. }
  1145. return 1;
  1146. }
  1147. #sub removeClusterInclusions{
  1148. # my $self=shift;
  1149. # my $cluster=shift;
  1150. # foreach my $c (@{$cluster}){
  1151. # my $src=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$$cluster[$c]{src}};
  1152. # my $trg=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$$cluster[$c]{trg}};
  1153. # }
  1154. #}
  1155. sub isIncluded{
  1156. my $self=shift;
  1157. my ($cluster1,$cluster2)=@_;
  1158. foreach (keys %{$cluster1->{src}}){
  1159. if (not defined $cluster2->{src}->{$_}){return 0;}
  1160. }
  1161. foreach (keys %{$cluster1->{trg}}){
  1162. if (not defined $cluster2->{trg}->{$_}){return 0;}
  1163. }
  1164. return 1;
  1165. }
  1166. sub findSrcOverlap{
  1167. my $self=shift;
  1168. return $self->findOverlap('src',@_);
  1169. }
  1170. sub findTrgOverlap{
  1171. my $self=shift;
  1172. return $self->findOverlap('trg',@_);
  1173. }
  1174. sub findOverlap{
  1175. my $self=shift;
  1176. my ($lang,$token,$cluster,$overlap)=@_;
  1177. my @c=grep (defined $$cluster[$_]{$lang}{$token},0..$#{$cluster});
  1178. foreach (@c){
  1179. $$overlap{$_}=1;
  1180. }
  1181. }
  1182. sub getMissingTokens{
  1183. my $self=shift;
  1184. my ($src,$trg)=@_;
  1185. my @srcAccepted=keys %{$self->{srcToken}};
  1186. my @trgAccepted=keys %{$self->{trgToken}};
  1187. my $srcPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$src};
  1188. my $trgPhr=join '(:[0-9]+)?:',sort {$a <=> $b} keys %{$trg};
  1189. my $missingSrc=undef;
  1190. my $missingTrg=undef;
  1191. my @match;
  1192. if (@match=grep(/$srcPhr/,@srcAccepted)){
  1193. @match=sort {length($a) <=> length($b)} @match;
  1194. if ($match[0]=~/^(.*)$srcPhr(.*)$/){
  1195. $missingSrc="$1$2$3$4$5$6$7$8$9";
  1196. }
  1197. if (@match=grep(/$trgPhr/,@trgAccepted)){
  1198. @match=sort {length($a) <=> length($b)} @match;
  1199. if ($match[0]=~/^(.*)$trgPhr(.*)$/){
  1200. $missingTrg="$1$2$3$4$5$6$7$8$9";
  1201. }
  1202. $missingSrc=~s/^://;$missingSrc=~s/:$//;
  1203. $missingTrg=~s/^://;$missingTrg=~s/:$//;
  1204. return ($missingSrc,$missingTrg);
  1205. }
  1206. }
  1207. return ();
  1208. }
  1209. sub isAdjacent{
  1210. my $self=shift;
  1211. my ($x,$y,$cluster)=@_;
  1212. if ((defined $$cluster{src}{$x}) and
  1213. ((defined $$cluster{trg}{$y-1}) or
  1214. ((defined $$cluster{trg}{$y+1})))){
  1215. return 1;
  1216. }
  1217. if ((defined $$cluster{trg}{$y}) and
  1218. ((defined $$cluster{src}{$x-1}) or
  1219. ((defined $$cluster{src}{$x+1})))){
  1220. return 1;
  1221. }
  1222. return 0;
  1223. }
  1224. sub isAdjacentScore{
  1225. my $self=shift;
  1226. my ($x,$y,$cluster,$p)=@_;
  1227. if ((defined $$cluster{src}{$x}) and
  1228. (defined $$cluster{trg}{$y-1})){
  1229. if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x]->[$y-1]*$p){
  1230. return 1;
  1231. }
  1232. return 0;
  1233. }
  1234. if ((defined $$cluster{src}{$x}) and
  1235. (defined $$cluster{trg}{$y+1})){
  1236. if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x]->[$y+1]*$p){
  1237. return 1;
  1238. }
  1239. return 0;
  1240. }
  1241. if ((defined $$cluster{src}{$x-1}) and
  1242. (defined $$cluster{trg}{$y})){
  1243. if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x-1]->[$y]*$p){
  1244. return 1;
  1245. }
  1246. return 0;
  1247. }
  1248. if ((defined $$cluster{src}{$x+1}) and
  1249. (defined $$cluster{trg}{$y})){
  1250. if ($self->{matrix}->[$x]->[$y]>=$self->{matrix}->[$x+1]->[$y]*$p){
  1251. return 1;
  1252. }
  1253. return 0;
  1254. }
  1255. return 0;
  1256. }
  1257. sub findClusterOverlap{
  1258. my $self=shift;
  1259. my ($x,$y,$cluster)=@_;
  1260. my @overlap=();
  1261. foreach (0..$#{$cluster}){
  1262. if (defined $$cluster[$_]{src}{$x}){
  1263. push(@overlap,$_);
  1264. }
  1265. elsif (defined $$cluster[$_]{trg}{$y}){
  1266. push(@overlap,$_);
  1267. }
  1268. }
  1269. return @overlap;
  1270. }
  1271. #========================================================================
  1272. sub cloneLinkMatrix{
  1273. my $self=shift;
  1274. my $matrix=shift;
  1275. my $clone=shift;
  1276. if (ref($matrix) ne 'ARRAY'){return ();}
  1277. if (ref($clone) ne 'ARRAY'){$clone=[];}
  1278. foreach my $x (0..$#{$matrix}){
  1279. foreach my $y (0..$#{$$matrix[$x]}){
  1280. $$clone[$x][$y]=$$matrix[$x][$y];
  1281. }
  1282. }
  1283. return $clone;
  1284. }
  1285. #==========================================================================
  1286. #
  1287. #
  1288. #
  1289. #==========================================================================
  1290. sub clueMatrixToHtml{
  1291. my $self=shift;
  1292. my $Matrix=$self->{matrix};
  1293. my $Token=$self->{token};
  1294. my $SrcTok=$$Token{source};
  1295. my $TrgTok=$$Token{target};
  1296. my $nrSrc=$#{$$Token{source}};
  1297. my $nrTrg=$#{$$Token{target}};
  1298. my $max;
  1299. foreach my $s (0..$nrSrc){
  1300. foreach my $t (0..$nrTrg){
  1301. if ($Matrix->[$s]->[$t]>$max){$max=$Matrix->[$s]->[$t];}
  1302. }
  1303. }
  1304. if (not $max){$max=1;}
  1305. my $html="<p>\n";
  1306. $html.="<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n";
  1307. $html.="<tr><th></th>\n";
  1308. foreach my $t (0..$nrTrg){
  1309. my $str=$TrgTok->[$t];
  1310. $html.="<th>$str</th>\n";
  1311. }
  1312. foreach my $s (0..$nrSrc){
  1313. $html.="</tr><tr>\n";
  1314. my $str=$SrcTok->[$s];
  1315. $html.="<th>$str</th>\n";
  1316. foreach my $t (0..$nrTrg){
  1317. my $score=0;
  1318. if ($Matrix->[$s]){
  1319. if ($Matrix->[$s]->[$t]){
  1320. $score=$Matrix->[$s]->[$t];
  1321. }
  1322. }
  1323. my $color=255-$score*256/$max;
  1324. if ($color==-1){$color=0;}
  1325. my $hex=sprintf("%X",$color);
  1326. if (length($hex)<2){$hex="0$hex";}
  1327. my $val=int(100*$score);
  1328. if ($color<128){
  1329. $html.="<td bgcolor=\"#$hex$hex$hex\">";
  1330. $html.='<font color="#ffffff">';
  1331. $html.="$val</font></td>\n";
  1332. }
  1333. else{
  1334. $html.="<td bgcolor=\"#$hex$hex$hex\">";
  1335. $html.="$val</td>\n";
  1336. }
  1337. }
  1338. }
  1339. $html.="</tr></table><hr>\n";
  1340. return $html;
  1341. }
  1342. sub printHtmlClueMatrix{
  1343. my $self=shift;
  1344. print STDERR $self->clueMatrixToHtml();
  1345. }
  1346. sub printClueMatrix{
  1347. my $self=shift;
  1348. my ($SrcTok,$TrgTok,$Matrix)=@_;
  1349. my $nrSrc=$#{$SrcTok};
  1350. my $nrTrg=$#{$TrgTok};
  1351. print STDERR "\n=====================================================\n";
  1352. print STDERR "final clue matrix scores";
  1353. print STDERR "\n=====================================================\n";
  1354. foreach my $s (0..$nrSrc){
  1355. foreach my $t (0..$nrTrg){
  1356. my $score=$Matrix->[$s]->[$t];
  1357. if ($score>0){
  1358. # printf STDERR "[%2d-%-2d] %15s - %-15s: %s\n",
  1359. printf STDERR "[%d %d] %20s - %-20s %s\n",
  1360. $s,$t,$$SrcTok[$s],$$TrgTok[$t],$score;
  1361. }
  1362. }
  1363. }
  1364. print STDERR "\n=====================================================\n";
  1365. print STDERR "clue matrix $nrSrc x $nrTrg";
  1366. print STDERR "\n=====================================================\n";
  1367. my @char=();
  1368. &MakeCharArr($TrgTok,\@char);
  1369. foreach my $c (0..$#char){
  1370. printf STDERR "\n%10s",' ';
  1371. foreach (@{$char[$c]}){
  1372. printf STDERR "%4s",$_;
  1373. }
  1374. }
  1375. print STDERR "\n";
  1376. foreach my $s (0..$nrSrc){
  1377. my $str=substr($SrcTok->[$s],0,10);
  1378. $str=&Uplug::Encoding::convert($str,'utf-8','iso-8859-1');
  1379. printf STDERR "%10s",$str;
  1380. foreach my $t (0..$nrTrg){
  1381. my $score=0;
  1382. if ($Matrix->[$s]){
  1383. if ($Matrix->[$s]->[$t]){
  1384. $score=$Matrix->[$s]->[$t];
  1385. }
  1386. }
  1387. printf STDERR " %3d",$score*100;
  1388. }
  1389. print STDERR "\n";
  1390. }
  1391. }
  1392. sub MakeCharArr{
  1393. my ($tok,$char)=@_;
  1394. my @lat1=@{$tok};
  1395. # my @lat1=();
  1396. # foreach (0..$#{$tok}){
  1397. # $lat1[$_]=&Uplug::Data::encode($tok->[$_],'utf-8','iso-8859-1');
  1398. # }
  1399. map ($lat1[$_]=&Uplug::Encoding::convert($lat1[$_],'utf-8','iso-8859-1'),
  1400. (0..$#lat1));
  1401. my $max=&MaxLength(\@lat1);
  1402. foreach my $t (0..$#{$tok}){
  1403. my @c=split(//,$lat1[$t]);
  1404. foreach (1..$max){
  1405. if (@c){
  1406. $char->[$max-$_]->[$t]=pop(@c);
  1407. # $char->[$max-$_]->[$t]=shift(@c);
  1408. }
  1409. else{$char->[$max-$_]->[$t]=' ';}
  1410. }
  1411. }
  1412. }
  1413. sub MaxLength{
  1414. my ($tok)=@_;
  1415. my $max=0;
  1416. foreach (@{$tok}){
  1417. if (length($_)>$max){$max=length($_);}
  1418. }
  1419. return $max;
  1420. }
  1421. ######### return a true value
  1422. 1;