PageRenderTime 48ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/uplug-main/lib/Uplug/Data/Align.pm

https://bitbucket.org/tiedemann/uplug
Perl | 650 lines | 492 code | 96 blank | 62 comment | 72 complexity | 16e560bbb52b4fa6696007695d59369c MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. ####################################################################
  2. # Copyright (C) 2004 Jörg Tiedemann
  3. #
  4. # This program is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation; either version 2 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program; if not, write to the Free Software
  16. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  17. #
  18. # $Author$
  19. # $Id$
  20. #
  21. ###########################################################################
  22. # Uplug::Data::Align
  23. #
  24. #
  25. #
  26. ###########################################################################
  27. package Uplug::Data::Align;
  28. use strict;
  29. use vars qw( @ISA );
  30. use Uplug::Data;
  31. use Uplug::Data::Lang;
  32. @ISA = qw( Uplug::Data );
  # Initialise (or re-initialise) this aligned-data object.
  #
  # Arguments:
  #   $srclang, $trglang - language identifiers handed to Uplug::Data::Lang
  #   remaining arguments - forwarded unchanged to the parent class' init()
  #
  # For each side (source/target) one of three things happens:
  #   * no language object yet        -> create one and record the language
  #   * a different language is given -> replace the object by a new one
  #   * otherwise                     -> re-initialise the existing object
  #
  # Returns whatever SUPER::init returns.
  sub init {
      my $self    = shift;
      my $srclang = shift;
      my $trglang = shift;

      if (not defined $self->{source}) {
          $self->{source} = Uplug::Data::Lang->new($srclang);
          $self->setOption('SRCLANG', $srclang);
      }
      elsif ((defined $srclang) and ($self->{SRCLANG} ne $srclang)) {
          # This branch used to call Uplug::Data::Lang->init(...) as a class
          # method, which stores init's return value instead of a language
          # object.  The target branch below has always used new().
          $self->{source} = Uplug::Data::Lang->new($srclang);
          $self->setOption('SRCLANG', $srclang);
      }
      else {
          $self->{source}->init();
      }

      if (not defined $self->{target}) {
          $self->{target} = Uplug::Data::Lang->new($trglang);
          $self->setOption('TRGLANG', $trglang);
      }
      elsif ((defined $trglang) and ($self->{TRGLANG} ne $trglang)) {
          $self->{target} = Uplug::Data::Lang->new($trglang);
          $self->setOption('TRGLANG', $trglang);
      }
      else {
          $self->{target}->init();
      }

      # NOTE(review): the comparisons above read $self->{SRCLANG}/{TRGLANG};
      # whether setOption() stores the value under those keys is not visible
      # here - confirm against Uplug::Data.
      return $self->SUPER::init(@_);
  }
  # Return a fresh Uplug::Data::Align object.  Nothing is copied from the
  # invocant and no arguments are passed to the constructor.
  sub clone {
      my $class = 'Uplug::Data::Align';
      return $class->new();
  }
  # Register the source and target language objects with subData() under
  # the names 'source' and 'target' (in that order).
  # Returns the result of the second (target) subData() call.
  sub makeLangSubData {
      my ($self) = @_;
      $self->subData($self->{source}, 'source');
      return $self->subData($self->{target}, 'target');
  }
  # Accessor: the object stored under the 'source' key.
  sub sourceData {
      my ($self) = @_;
      return $self->{source};
  }
  # Accessor: the object stored under the 'target' key.
  sub targetData {
      my ($self) = @_;
      return $self->{target};
  }
  # Accessor: the object stored under the 'link' key.
  sub linkData {
      my ($self) = @_;
      return $self->{link};
  }
  # Delegate getTokens() to the language object of one side.
  #
  #   $lang  - 'source' or 'target' (defaults to 'source' when undefined)
  #   $param - parameter hash; language-specific keys are resolved by
  #            makeParameter() before the call
  #   any further arguments are passed on untouched
  #
  # Returns undef when no object is stored for $lang, otherwise whatever
  # the language object's getTokens() returns.
  sub getTokens {
      my $self = shift;
      my ($lang, $param) = splice(@_, 0, 2);

      $lang = 'source' unless defined $lang;
      return undef unless ref($self->{$lang});

      $self->makeParameter($param, $lang);
      return $self->{$lang}->getTokens($param, @_);
  }
  # Shortcut for getTokens('source', ...).
  sub getSrcTokens {
      my $self = shift;
      return $self->getTokens('source', @_);
  }
  # Shortcut for getTokens('target', ...).
  sub getTrgTokens {
      my $self = shift;
      return $self->getTokens('target', @_);
  }
  # Delegate getNgrams() to the language object of one side.
  #
  #   $lang  - 'source' or 'target' (defaults to 'source' when undefined)
  #   $param - parameter hash, passed through makeParameter() first
  #   any further arguments are passed on untouched
  #
  # Returns undef when no object is stored for $lang.
  sub getNgrams {
      my $self = shift;
      my ($lang, $param) = splice(@_, 0, 2);

      $lang = 'source' unless defined $lang;
      return undef unless ref($self->{$lang});

      $self->makeParameter($param, $lang);
      return $self->{$lang}->getNgrams($param, @_);
  }
  # Shortcut for getNgrams('source', ...).
  sub getSrcNgrams {
      my $self = shift;
      return $self->getNgrams('source', @_);
  }
  # Shortcut for getNgrams('target', ...).
  sub getTrgNgrams {
      my $self = shift;
      return $self->getNgrams('target', @_);
  }
  # Delegate getChunks() to the language object of one side.
  #
  #   $lang  - 'source' or 'target' (defaults to 'source' when undefined)
  #   $param - parameter hash, passed through makeParameter() first
  #   any further arguments are passed on untouched
  #
  # Returns undef when no object is stored for $lang.
  sub getChunks {
      my $self = shift;
      my ($lang, $param) = splice(@_, 0, 2);

      $lang = 'source' unless defined $lang;
      return undef unless ref($self->{$lang});

      $self->makeParameter($param, $lang);
      return $self->{$lang}->getChunks($param, @_);
  }
  # Shortcut for getChunks('source', ...).
  sub getSrcChunks {
      my $self = shift;
      return $self->getChunks('source', @_);
  }
  # Shortcut for getChunks('target', ...).
  sub getTrgChunks {
      my $self = shift;
      return $self->getChunks('target', @_);
  }
  # Delegate getPhrases() to the language object of one side.
  #
  #   $lang  - 'source' or 'target' (defaults to 'source' when undefined)
  #   $param - parameter hash, passed through makeParameter() first
  #   any further arguments are passed on untouched
  #
  # Returns undef when no object is stored for $lang.
  sub getPhrases {
      my $self = shift;
      my ($lang, $param) = splice(@_, 0, 2);

      $lang = 'source' unless defined $lang;
      return undef unless ref($self->{$lang});

      $self->makeParameter($param, $lang);
      return $self->{$lang}->getPhrases($param, @_);
  }
  # Shortcut for getPhrases('source', ...).
  sub getSrcPhrases {
      my $self = shift;
      return $self->getPhrases('source', @_);
  }
  # Shortcut for getPhrases('target', ...).
  sub getTrgPhrases {
      my $self = shift;
      return $self->getPhrases('target', @_);
  }
  # Map the nodes of a phrase to their positions in the token list.
  #
  #   $phraseNodes - array ref with the nodes that make up the phrase
  #   $tokenNodes  - array ref with all token nodes of the segment
  #
  # Nodes are compared with '==' (for references: identity).  Returns the
  # matching token indices joined with ':' (e.g. "1:3"), in phrase-node
  # order; an empty string when nothing matches.
  #
  # The former $lastIdx variable was removed: it was re-declared for every
  # phrase node and only changed after the inner range had been evaluated,
  # so it never influenced the search.  The full scan below is what the
  # code has always done.
  sub getPhrasePos {
      my $self = shift;
      my ($phraseNodes, $tokenNodes) = @_;

      my @idx = ();
      foreach my $p (0 .. $#{$phraseNodes}) {
          foreach my $t (0 .. $#{$tokenNodes}) {
              if ($$phraseNodes[$p] == $$tokenNodes[$t]) {
                  push(@idx, $t);
              }
          }
      }
      return join ":", @idx;
  }
  # Difference between the position of a target phrase and the position of
  # a source phrase, as reported by each side's getPhrasePosition().
  # Returns 0 when either position is undefined.
  sub getRelativePosition {
      my ($self, $srcPhr, $trgPhr) = @_;

      my $srcPos = $self->{source}->getPhrasePosition($srcPhr);
      my $trgPos = $self->{target}->getPhrasePosition($trgPhr);

      return 0 unless defined $srcPos;
      return 0 unless defined $trgPos;
      return $trgPos - $srcPos;
  }
  # Alias for getAlignPhrases(): all arguments are forwarded and its result
  # is returned.
  #
  # The call is made as a method on $self.  It used to be a plain function
  # call after $self had been shifted off, so getAlignPhrases() received
  # the parameter hash as its invocant.
  sub getFeaturePairs {
      my $self = shift;
      return $self->getAlignPhrases(@_);
  }
  # Collect phrase features for both sides of an aligned segment.
  #
  #   $param - parameter hash (replaced by an empty hash if not a HASH ref)
  #   $src   - hash ref, filled with  position-string => feature  (source)
  #   $trg   - hash ref, filled with  position-string => feature  (target)
  #   $token - optional hash ref; receives the token lists under the keys
  #            'source' and 'target'
  #   $attr  - optional hash ref (only used when $token is a hash ref);
  #            receives per-token attribute hashes plus a 'content' entry
  #
  # Position strings are what getPhrasePos() returns for a phrase.  When
  # $param contains 'relative position', "pos(N)" is appended to (or, for
  # a blank feature, replaces) the feature string.
  #
  # The sub has no explicit return value.
  #
  # NOTE(review): makeParameter() writes language-specific settings into
  # the shared $param hash, so values copied for 'source' remain visible
  # while 'target' is processed unless a '(target)' key overrides them.
  sub getAlignPhrases {
      my $self = shift;
      my ($param, $src, $trg, $token, $attr) = @_;
      if (ref($param) ne 'HASH') { $param = {}; }

      # 1) all tokens (the node lists are filled through the references)
      my @srcTokNodes = ();
      my @trgTokNodes = ();
      my @srcTok = $self->getSrcTokens($param, \@srcTokNodes);
      my @trgTok = $self->getTrgTokens($param, \@trgTokNodes);

      # 2) all possible phrases
      my $srcNodes = [];
      my $trgNodes = [];
      my @srcPhr = $self->getSrcPhrases($param, $srcNodes,
                                        \@srcTokNodes, \@srcTok);
      my @trgPhr = $self->getTrgPhrases($param, $trgNodes,
                                        \@trgTokNodes, \@trgTok);

      # 3) token positions of each phrase
      my @srcIdx = ();
      my @trgIdx = ();
      foreach my $i (0 .. $#srcPhr) {
          push(@srcIdx, $self->getPhrasePos($$srcNodes[$i], \@srcTokNodes));
      }
      foreach my $i (0 .. $#trgPhr) {
          push(@trgIdx, $self->getPhrasePos($$trgNodes[$i], \@trgTokNodes));
      }

      # 4) phrase features, first for the source and then for the target
      my @sides = (
          [ 'source', $srcNodes, \@srcIdx, $src ],
          [ 'target', $trgNodes, \@trgIdx, $trg ],
      );
      foreach my $side (@sides) {
          my ($lang, $nodes, $idx, $out) = @{$side};
          $self->makeParameter($param, $lang);
          foreach my $i (0 .. $#{$nodes}) {
              my $key = $$idx[$i];
              $$out{$key} =
                  $self->{$lang}->getPhraseFeature(\@{$$nodes[$i]}, $param);
              next unless defined $$param{'relative position'};
              my $pos = $self->{$lang}->getPhrasePosition($$nodes[$i]);
              if ($$out{$key} =~ /\S/) {
                  $$out{$key} .= ":pos($pos)";
              }
              else {
                  $$out{$key} = "pos($pos)";
              }
          }
      }

      # 5) optional token and attribute output
      if (ref($token) eq 'HASH') {
          @{$$token{source}} = @srcTok;
          @{$$token{target}} = @trgTok;
          if (ref($attr) eq 'HASH') {
              @{$$attr{source}} = $self->{source}->attribute(\@srcTokNodes);
              @{$$attr{target}} = $self->{target}->attribute(\@trgTokNodes);
              foreach my $i (0 .. $#srcTokNodes) {
                  $$attr{source}[$i]{content} = $self->content($srcTokNodes[$i]);
              }
              foreach my $i (0 .. $#trgTokNodes) {
                  $$attr{target}[$i]{content} = $self->content($trgTokNodes[$i]);
              }
          }
      }
  }
  # Shortcut for getTokenFeatures('source', ...).
  sub getSrcTokenFeatures {
      my $self = shift;
      my $side = 'source';
      return $self->getTokenFeatures($side, @_);
  }
  # Shortcut for getTokenFeatures('target', ...).
  sub getTrgTokenFeatures {
      my $self = shift;
      my $side = 'target';
      return $self->getTokenFeatures($side, @_);
  }
  # Return one feature per token of the given side.
  #
  #   $lang  - 'source' or 'target'
  #   $param - feature parameters (an empty hash is used if not a HASH ref)
  #   $nodes - array ref handed to the language object's getTokens()
  #            (a fresh array is used if not an ARRAY ref)
  #
  # With an empty parameter hash the plain token list is returned.
  # Otherwise every entry is replaced by the getPhraseFeature() value of
  # the corresponding single node.  Returns undef when no object is stored
  # for $lang.
  sub getTokenFeatures {
      my $self = shift;
      my $lang = shift;                        # source / target
      my ($param, $nodes) = @_;

      $param = {} if ref($param) ne 'HASH';
      $nodes = [] if ref($nodes) ne 'ARRAY';
      return undef if not ref($self->{$lang});

      $self->makeParameter($param, $lang);
      my $langData = $self->{$lang};
      my @tok = $langData->getTokens($param, $nodes);
      if (keys %{$param}) {
          foreach my $i (0 .. $#{$nodes}) {
              $tok[$i] = $self->{$lang}->getPhraseFeature([ $$nodes[$i] ], $param);
          }
      }
      return @tok;
  }
  # Collect several named features per phrase for both sides of a segment.
  #
  #   $param - hash of  feature-name => parameter hash.  The entry
  #            'general' supplies the parameters for token and phrase
  #            extraction; for that name the stored "feature" is the
  #            phrase string itself.
  #   $src   - hash ref, filled with  position-string => { name => feature }
  #   $trg   - the same for the target side
  #   $token - optional hash ref; receives the token lists under the keys
  #            'source' and 'target'
  #   $attr  - optional hash ref (only used when $token is a hash ref);
  #            receives per-token attribute hashes plus a 'content' entry
  #
  # For every other feature name the value comes from the language
  # object's getPhraseFeature(); if that feature's parameter hash contains
  # 'relative position', "pos(N)" is appended to (or, for a blank feature,
  # replaces) the feature string.
  #
  # The sub has no explicit return value.
  #
  # NOTE(review): makeParameter() modifies $$param{$p} in place, and the
  # same hash is used for source and target, so source-specific values
  # stay visible for the target unless a '(target)' key overrides them.
  sub getBitextPhrases {
      my $self = shift;
      my ($param, $src, $trg, $token, $attr) = @_;
      if (ref($param) ne 'HASH') { $param = {}; }

      # 1) all tokens (the node lists are filled through the references)
      my @srcTokNodes = ();
      my @trgTokNodes = ();
      my @srcTok = $self->getSrcTokens($$param{general}, \@srcTokNodes);
      my @trgTok = $self->getTrgTokens($$param{general}, \@trgTokNodes);

      # 2) all possible phrases
      my $srcNodes = [];
      my $trgNodes = [];
      my @srcPhr = $self->getSrcPhrases($$param{general}, $srcNodes,
                                        \@srcTokNodes, \@srcTok);
      my @trgPhr = $self->getTrgPhrases($$param{general}, $trgNodes,
                                        \@trgTokNodes, \@trgTok);

      # 3) token positions of each phrase
      my @srcIdx = ();
      my @trgIdx = ();
      foreach my $i (0 .. $#srcPhr) {
          push(@srcIdx, $self->getPhrasePos($$srcNodes[$i], \@srcTokNodes));
      }
      foreach my $i (0 .. $#trgPhr) {
          push(@trgIdx, $self->getPhrasePos($$trgNodes[$i], \@trgTokNodes));
      }

      # 4) phrase features, one pass per feature name
      my @sides = (
          [ 'source', $srcNodes, \@srcIdx, $src ],
          [ 'target', $trgNodes, \@trgIdx, $trg ],
      );
      foreach my $p (keys %{$param}) {

          # a) general = the phrase string itself
          if ($p eq 'general') {
              foreach my $i (0 .. $#srcPhr) {
                  $$src{$srcIdx[$i]}{$p} = $srcPhr[$i];
              }
              foreach my $i (0 .. $#trgPhr) {
                  $$trg{$trgIdx[$i]}{$p} = $trgPhr[$i];
              }
              next;
          }

          # b) any other feature: source first, then target
          foreach my $side (@sides) {
              my ($lang, $nodes, $idx, $out) = @{$side};
              my $langParam = $$param{$p};
              $self->makeParameter($langParam, $lang);
              foreach my $i (0 .. $#{$nodes}) {
                  my $key = $$idx[$i];
                  $$out{$key}{$p} =
                      $self->{$lang}->getPhraseFeature(\@{$$nodes[$i]},
                                                       $langParam);
                  next unless ((ref($$param{$p}) eq 'HASH') and
                               (defined $$param{$p}{'relative position'}));
                  my $pos = $self->{$lang}->getPhrasePosition($$nodes[$i]);
                  if ($$out{$key}{$p} =~ /\S/) {
                      $$out{$key}{$p} .= ":pos($pos)";
                  }
                  else {
                      $$out{$key}{$p} = "pos($pos)";
                  }
              }
          }
      }

      # 5) optional token and attribute output
      if (ref($token) eq 'HASH') {
          @{$$token{source}} = @srcTok;
          @{$$token{target}} = @trgTok;
          if (ref($attr) eq 'HASH') {
              @{$$attr{source}} = $self->{source}->attribute(\@srcTokNodes);
              @{$$attr{target}} = $self->{target}->attribute(\@trgTokNodes);
              foreach my $i (0 .. $#srcTokNodes) {
                  $$attr{source}[$i]{content} = $self->content($srcTokNodes[$i]);
              }
              foreach my $i (0 .. $#trgTokNodes) {
                  $$attr{target}[$i]{content} = $self->content($trgTokNodes[$i]);
              }
          }
      }
  }
  # Delegate getPhraseFeature() to the language object of one side.
  #
  #   $lang  - 'source' or 'target' (defaults to 'source' when undefined)
  #   $nodes - nodes of the phrase, passed through unchanged
  #   $param - parameter hash, passed through makeParameter() first
  #   any further arguments are appended to the delegated call
  #
  # Returns undef when no object is stored for $lang.
  sub getPhraseFeature {
      my $self = shift;
      my ($lang, $nodes, $param) = splice(@_, 0, 3);

      $lang = 'source' unless defined $lang;
      return undef unless ref($self->{$lang});

      $self->makeParameter($param, $lang);
      return $self->{$lang}->getPhraseFeature($nodes, $param, @_);
  }
  # Shortcut for getPhraseFeature('source', ...).
  sub getSrcPhraseFeature {
      my $self = shift;
      return $self->getPhraseFeature('source', @_);
  }
  # Shortcut for getPhraseFeature('target', ...).
  sub getTrgPhraseFeature {
      my $self = shift;
      return $self->getPhraseFeature('target', @_);
  }
  # Decide whether a source/target string pair passes the configured
  # filters.  Returns 0 as soon as one filter rejects the pair, else 1.
  #
  # Filters, checked in this order and only when the setting is true:
  #   'minimal length (source)' - length($src) must not be smaller
  #   'minimal length (target)' - length($trg) must not be smaller
  #   'minimal length diff'     - lengthQuotient() must not be smaller
  #   'matching word class'     - isSameType() with this setting must hold
  #   'stop words'              - isSameType() with this setting must hold
  sub checkPairParameter {
      my ($self, $src, $trg, $param) = @_;
      my $limit;

      $limit = $$param{'minimal length (source)'};
      return 0 if $limit and length($src) < $limit;

      $limit = $$param{'minimal length (target)'};
      return 0 if $limit and length($trg) < $limit;

      $limit = $$param{'minimal length diff'};
      return 0 if $limit and $self->lengthQuotient($src, $trg) < $limit;

      foreach my $setting ('matching word class', 'stop words') {
          my $mode = $$param{$setting};
          next unless $mode;
          return 0 unless $self->isSameType($src, $trg, $mode);
      }
      return 1;
  }
  # Compare the word types of a source and a target string.
  #
  # $check selects the comparison:
  #   'open/closed', 'same'        - both are stop words or both are not
  #   'exclude'                    - reject only pairs of two stop words
  #   'same_class', 'wordclass'    - result of isSameClass()
  #   'same_sub_class', 'subclass' - result of isSameSubClass()
  #   anything else                - always 1
  #
  # Stop-word status comes from each side's isStopWord().
  sub isSameType {
      my ($self, $src, $trg, $check) = @_;

      if (($check eq 'open/closed') or ($check eq 'same')) {
          return $self->{target}->isStopWord($trg)
              if $self->{source}->isStopWord($src);
          return (not $self->{target}->isStopWord($trg));
      }
      if ($check eq 'exclude') {
          return 1 unless $self->{source}->isStopWord($src);
          return (not $self->{target}->isStopWord($trg));
      }
      if (($check eq 'same_class') or ($check eq 'wordclass')) {
          return $self->isSameClass($src, $trg);
      }
      if (($check eq 'same_sub_class') or ($check eq 'subclass')) {
          return $self->isSameSubClass($src, $trg);
      }
      return 1;
  }
  # Check whether a source and a target word belong to the same stop-word
  # class.
  #
  # The class tables are fetched from both language objects with
  # getLanguageData('stop word class hash') and are expected to have the
  # shape  class => { word => ... }.
  #
  # Returns
  #   1 - no usable table on either side,
  #       or $src is in a class that also lists $trg on the target side,
  #       or neither word is listed anywhere
  #   0 - $src is listed but $trg is not in the same class,
  #       or only $trg is listed
  #
  # The loops iterate over the KEYS of the tables.  They used to iterate
  # over the flattened hashes (keys and values); the stringified value
  # references were then used as keys and autovivified junk entries in the
  # language data on every call that scanned a whole table.
  sub isSameClass {
      my $self = shift;
      my ($src, $trg) = @_;

      my $cat     = 'stop word class hash';
      my $SrcData = $self->{source}->getLanguageData($cat);
      my $TrgData = $self->{target}->getLanguageData($cat);
      if (ref($SrcData) ne 'HASH') { return 1; }
      if (ref($TrgData) ne 'HASH') { return 1; }

      foreach my $c (keys %{$SrcData}) {
          if (defined $$SrcData{$c}{$src}) {
              if (defined $$TrgData{$c}) {
                  if (defined $$TrgData{$c}{$trg}) {
                      return 1;
                  }
              }
              return 0;
          }
      }
      foreach my $c (keys %{$TrgData}) {
          if (defined $$TrgData{$c}{$trg}) {
              return 0;
          }
      }
      return 1;
  }
  # Check whether a source and a target word belong to the same stop-word
  # sub-class.
  #
  # The tables are fetched from both language objects with
  # getLanguageData('stop word subclass hash') and are expected to have
  # the shape  class => { subclass => { word => ... } }.
  #
  # Returns
  #   1 - no usable table on either side,
  #       or $src is in a class/subclass that also lists $trg on the
  #       target side, or neither word is listed anywhere
  #   0 - $src is listed but $trg is not in the same class/subclass,
  #       or only $trg is listed
  #
  # Both source loops iterate over KEYS, like the target loops always did.
  # They used to iterate over the flattened hashes, which used stringified
  # value references as keys and autovivified junk entries in the language
  # data.
  sub isSameSubClass {
      my $self = shift;
      my ($src, $trg) = @_;

      my $cat     = 'stop word subclass hash';
      my $SrcData = $self->{source}->getLanguageData($cat);
      my $TrgData = $self->{target}->getLanguageData($cat);
      if (ref($SrcData) ne 'HASH') { return 1; }
      if (ref($TrgData) ne 'HASH') { return 1; }

      foreach my $x (keys %{$SrcData}) {
          foreach my $y (keys %{$$SrcData{$x}}) {
              if (defined $$SrcData{$x}{$y}{$src}) {
                  if (defined $$TrgData{$x}) {
                      if (defined $$TrgData{$x}{$y}) {
                          if (defined $$TrgData{$x}{$y}{$trg}) {
                              return 1;
                          }
                      }
                  }
                  return 0;
              }
          }
      }
      foreach my $x (keys %{$TrgData}) {
          foreach my $y (keys %{$$TrgData{$x}}) {
              if (defined $$TrgData{$x}{$y}{$trg}) {
                  return 0;
              }
          }
      }
      return 1;
  }
  # Ratio of the shorter to the longer string length (0 < value <= 1).
  # Returns 0 when either string is empty.
  sub lengthQuotient {
      my ($self, $src, $trg) = @_;

      my $srcLen = length($src);
      my $trgLen = length($trg);

      return 0 if $srcLen == 0 or $trgLen == 0;
      return $srcLen > $trgLen
          ? $trgLen / $srcLen
          : $srcLen / $trgLen;
  }
  # Resolve language-specific parameters in place.
  #
  # For every key that starts with "NAME (LANG)", where LANG is the given
  # $lang, the value is copied to the plain key NAME in the same hash.
  # The language-specific entry itself is kept.  Nothing happens when
  # $param is not a HASH reference.
  sub makeParameter {
      my ($self, $param, $lang) = @_;
      return if ref($param) ne 'HASH';

      foreach my $key (keys %{$param}) {
          next unless $key =~ /^(.*) \($lang\)/;
          $param->{$1} = $param->{$key};
      }
  }
  # Remove every token that is already part of a word link.
  #
  # The 'xtargets' attribute of each 'wordLink' node in the link data is
  # split at the ';' into a source and a target span.  The tokens of each
  # span are removed from the source/target data via rmToken().  Values
  # that do not match the "source;target" pattern are skipped.
  sub rmLinkedToken {
      my ($data) = @_;

      my $srcData = $data->{source};
      my $trgData = $data->{target};
      my $link    = $data->{link};

      my @nodes = $link->findNodes('wordLink');
      my @xtrg  = $link->attribute(\@nodes, 'xtargets');

      foreach my $pair (@xtrg) {
          # copy the captures before calling into other code
          my ($srcSpan, $trgSpan) = $pair =~ /^(.*\S)\s?\;\s?(\S.*)$/
              or next;
          $data->rmToken($srcSpan, $srcData);
          $data->rmToken($trgSpan, $trgData);
      }
  }
  # Remove the nodes named in a span from a data object.
  #
  #   $span - node ids separated by '+' or whitespace
  #   $data - object that is searched with findNodes('.*', { id => ID })
  #
  # For each id the first node found is detached from its parent and
  # disposed; ids without a matching node are ignored.
  sub rmToken {
      my ($self, $span, $data) = @_;

      foreach my $id (split(/[\+\s]/, $span)) {
          my ($node) = $data->findNodes('.*', { id => $id });
          next unless defined $node;
          $node->getParentNode->removeChild($node);
          $node->dispose();
      }
  }
  # Delete all 'wordLink' nodes from the link data, if link data exists.
  sub rmWordLinks {
      my ($data) = @_;
      my $links = $data->{link};
      if (ref($links)) {
          $links->delNodes('wordLink');
      }
  }
  # Look up existing 'wordLink' nodes for a link.
  #
  #   $link - hash ref; its 'source' and 'target' values are used as the
  #           'src' and 'trg' attributes of the search
  #
  # Returns the matching nodes (their number in scalar context), or undef
  # when nothing was found.
  #
  # NOTE(review): addWordLink() stores 'xtargets', 'span', 'lexPair' and
  # 'certainty' but no 'src'/'trg' attributes - confirm that findNodes()
  # can actually match the links created there.
  sub findLink {
      my ($self, $link) = @_;

      my %attr = (
          src => $link->{source},
          trg => $link->{target},
      );
      my @nodes = $self->{link}->findNodes('wordLink', \%attr);

      return @nodes if @nodes;
      return undef;
  }
  # Add a 'wordLink' node for $link to the link data unless findLink()
  # already reports one.
  #
  # Fields read from the $link hash:
  #   score          - stored as 'certainty' (only when defined)
  #   link           - stored as 'lexPair'
  #   source, target - joined as "source;target" into 'xtargets', with
  #                    every ':' replaced by '+'
  #   src, trg       - when both are true: '&' is replaced by '+' IN THE
  #                    $link HASH ITSELF and "src;trg" is stored as 'span'
  #
  # Returns nothing when the link already exists, otherwise the result of
  # addNode().
  sub addWordLink {
      my ($data, $link) = @_;
      my $OutData = $data->{link};

      return if defined $data->findLink($link);

      my %attr = ();
      $attr{certainty} = $link->{score} if defined $link->{score};
      $attr{lexPair}   = $link->{link};

      my $xtargets = "$link->{source};$link->{target}";
      $xtargets =~ tr/:/+/;
      $attr{xtargets} = $xtargets;

      if ($link->{src} and $link->{trg}) {
          foreach my $end ('src', 'trg') {
              $link->{$end} =~ tr/\&/\+/;
          }
          $attr{span} = "$link->{src};$link->{trg}";
      }

      my $wordLink = $OutData->createNode('wordLink', \%attr);
      return $OutData->addNode($wordLink);
  }
  # HTML rendering of the segment: the source side's toHTML() output
  # followed by the target side's.
  sub toHTML {
      my ($self) = @_;
      my $srcHtml = $self->{source}->toHTML();
      my $trgHtml = $self->{target}->toHTML();
      return $srcHtml . $trgHtml;
  }