PageRenderTime 49ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/uplug-main/tools/uplug-readalign

https://bitbucket.org/tiedemann/uplug
Perl | 347 lines | 262 code | 47 blank | 38 comment | 56 complexity | 6eaa14dd81db8fd6c5c3fc5c27657262 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. #!/usr/bin/perl
  2. #
  3. # Copyright (C) 2004 Jörg Tiedemann <joerg@stp.ling.uu.se>
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. #
  19. # $Id: readalign,v 1.6 2009/06/10 21:58:01 joerg72 Exp $
  20. #
  21. # usage: readalign [-m max] [-h] xces-file
  22. #
  23. # -m max: show <max> number of sentence alignments
  24. # -h : print html
  25. #
  26. =head1 NAME
  27. uplug-readalign - read sentence alignment in XCES align format
  28. =head1 SYNOPSIS
  29. # read sentence alignments and print aligned sentences
  30. uplug-readalign align-file.xml
  31. # print only alignments with certainty >= 0 (the link threshold given with -c)
  32. uplug-readalign -c 0 align-file.xml
  33. # print alignments with max 2 source sentences and 3 target sentences
  34. uplug-readalign -S 2 -T 3 align-file.xml
  35. # print aligned sentences marked as 'de' (source) and 'en' (target)
  36. # (this only works if sentences are marked with languages:
  37. # for example, in the German XML file: <s lang="de">...</s>)
  38. uplug-readalign -s de -t en align-file.xml
  39. # wrap aligned sentences in simple HTML
  40. uplug-readalign -h align-file.xml
  41. # print max 10 alignments
  42. uplug-readalign -m 10 align-file.xml
  43. # specify home directory of aligned XML files
  44. uplug-readalign -d /path/to/xml/files align-file.xml
  45. # print XCES align format of all 1:1 sentence alignments
  46. uplug-readalign -S 1 -T 1 -l align-file.xml
  47. =head1 USAGE
  48. uplug-readalign [OPTIONS] align-file.xml
  49. =head1 OPTIONS
  50. -c <thr> ........... set a link threshold <thr>
  51. -d <dir> ........... set home directory for aligned XML documents
  52. -h ................. print simple HTML
  53. -l ................. print links (filter mode)
  54. -m <max> ........... print max <max> alignments
  55. -s <LangID> ........ require source sentences to match <LangID>
  56. -t <LangID> ........ require target sentences to match <LangID>
  57. -S <max> ........... maximum number of source sentences in alignments
  58. -T <max> ........... maximum number of target sentences in alignments
  59. =head1 DESCRIPTION
  60. C<uplug-readalign> is a simple script that reads sentence alignments stored in XCES align format and prints the aligned sentences to STDOUT. It requires monotonic alignments (ascending order, no crossing links) of sentences in linked XML files. Linked XML files are specified in the C<fromDoc> (source) and C<toDoc> (target) attributes (see below).
  61. <cesAlign version="1.0">
  62. <linkGrp targType="s" fromDoc="source1.xml" toDoc="target1.xml">
  63. <link certainty="0.88" xtargets="s1.1 s1.2;s1.1" id="SL1" />
  64. ....
  65. <linkGrp targType="s" fromDoc="source2.xml" toDoc="target2.xml">
  66. <link certainty="0.88" xtargets="s1.1;s1.1" id="SL1" />
  67. Several parameters can be set to filter the alignments and to print only certain types of alignments.
  68. C<uplug-readalign> can also be used to filter the XCES alignment files and to print the remaining links in the same XCES align format. Use the C<-l> flag to enable this mode.
  69. =head1 See also
  70. More information on Uplug: Look at L<Uplug::Config>
  71. More downloads:
  72. L<https://bitbucket.org/tiedemann/uplug>
  73. =cut
  74. use strict;
  75. use FindBin qw($Bin);
  # ---- run-time settings and their defaults ----
  my $html       = 0;        # -h: wrap the output in simple HTML
  my $max        = 0;        # -m: stop after this many alignments (0 = no limit)
  my $SrcID      = undef;    # -s: language id required for source sentences
  my $TrgID      = undef;    # -t: language id required for target sentences
  my $MaxSrc     = undef;    # -S: max number of source sentences per alignment
  my $MaxTrg     = undef;    # -T: max number of target sentences per alignment
  my $LinkThr    = undef;    # -c: link certainty threshold
  my $FilterMode = 0;        # -l: filter mode, print the alignment XML itself
  my $SkipDocs   = undef;    # -N: RE pattern for documents to be skipped
  my $dir        = 'xml';    # -d: extra directory to check for from/toDoc

  # One handler per option letter.  Only the character right after the dash
  # is significant; options that take a value consume the next argument.
  my %option_handler = (
      h => sub { $html       = 1; },
      m => sub { $max        = shift @ARGV; },
      d => sub { $dir        = shift @ARGV; },
      s => sub { $SrcID      = shift @ARGV; },
      t => sub { $TrgID      = shift @ARGV; },
      S => sub { $MaxSrc     = shift @ARGV; },
      T => sub { $MaxTrg     = shift @ARGV; },
      c => sub { $LinkThr    = shift @ARGV; },
      N => sub { $SkipDocs   = shift @ARGV; },
      l => sub { $FilterMode = 1; },
  );

  # Consume leading arguments that start with a dash; unknown options are
  # dropped silently.
  while (@ARGV && $ARGV[0] =~ /^-/) {
      my $option = shift @ARGV;
      my ($letter) = $option =~ /^-(.)/;
      next unless defined $letter;
      my $handler = $option_handler{$letter};
      $handler->() if $handler;
  }
  # ---- open the alignment file (first non-option argument) ----
  my $ALIGN  = shift(@ARGV);
  my $srcdoc = '';    # name of the currently opened source document
  my $trgdoc = '';    # name of the currently opened target document

  # Without a file argument there is nothing to read: say so explicitly.
  if (not defined $ALIGN) {
      die "usage: uplug-readalign [OPTIONS] align-file.xml\n";
  }

  # Fall back to a gzipped copy if the plain file is missing.
  if ((not -e $ALIGN) and (-e "$ALIGN.gz")) { $ALIGN = "$ALIGN.gz"; }
  if (not -e $ALIGN) { die "Alignment file $ALIGN does not exist!\n"; }

  # The suffix test is anchored like the ones used for the aligned documents.
  # gzip is started in list form so the file name never passes through a
  # shell, and every open is checked instead of failing silently.
  if ($ALIGN =~ /\.gz$/) {
      open(F, '-|', 'gzip', '-cd', '--', $ALIGN)
          or die "Cannot start gzip for $ALIGN: $!\n";
  }
  else {
      open(F, '<', $ALIGN)
          or die "Cannot open alignment file $ALIGN: $!\n";
  }
  # ---- main loop: read the alignment file line by line ----
  #
  # For every <linkGrp> the documents named in fromDoc (source) and toDoc
  # (target) are opened; for every <link> the sentences listed in xtargets
  # are looked up in those documents and printed.  In filter mode (-l) the
  # alignment lines that survive all filters are echoed instead.
  if ($html) { PrintHtmlHeader(); }

  my $srcfh;          # handle on the current source document (undef if none)
  my $trgfh;          # handle on the current target document (undef if none)
  my $count = 0;      # number of links seen so far (checked against -m)

  LINE: while (my $line = <F>) {

      # a new source document
      if ($line =~ /fromDoc=\"([^\"]+)\"/) {
          my $name = $1;
          if ($srcdoc ne $name) {
              close $srcfh if $srcfh;
              $srcdoc = LocateDoc($name);
              $srcfh  = OpenDoc($srcdoc);
          }
      }

      # a new target document
      if ($line =~ /toDoc=\"([^\"]+)\"/) {
          my $name = $1;
          if ($trgdoc ne $name) {
              $trgdoc = $name;
              # skipped documents are neither opened nor announced
              if (defined $SkipDocs) {
                  next LINE if ($srcdoc =~ /$SkipDocs/);
                  next LINE if ($trgdoc =~ /$SkipDocs/);
              }
              close $trgfh if $trgfh;
              $trgdoc = LocateDoc($name);
              $trgfh  = OpenDoc($trgdoc);
              unless ($FilterMode) {
                  if ($html) { print "<p>\n"; }
                  print "\n# " . $srcdoc;
                  if ($html) { print '<br>'; }
                  print "\n# " . $trgdoc . "\n\n";
                  if ($html) { print "<p><hr>\n"; }
                  else       { print "================================\n"; }
              }
          }
      }

      # ignore everything that belongs to a skipped document pair
      if (defined $SkipDocs) {
          next LINE if ($srcdoc =~ /$SkipDocs/);
          next LINE if ($trgdoc =~ /$SkipDocs/);
      }

      if ($line =~ /xtargets=\"([^\"]*)\s*\;\s*([^\"]*)\"/) {
          # save the captures right away, before any other match can run
          my ($src, $trg) = ($1, $2);

          if (defined $LinkThr) {
              if ($line =~ /certainty=\"(.*?)\"/) {
                  next LINE if ($1 < $LinkThr);
              }
          }

          $count++;
          if ($max and ($count > $max)) { last LINE; }

          # split(' ') ignores leading and repeated blanks, so no empty
          # sentence ids are produced
          my @srcsent = split(' ', $src);
          my @trgsent = split(' ', $trg);
          if (defined $MaxSrc) {
              next LINE if (scalar @srcsent > $MaxSrc);
          }
          if (defined $MaxTrg) {
              next LINE if (scalar @trgsent > $MaxTrg);
          }

          my ($SrcStr, $srceof) = CollectSentences($srcfh, \@srcsent, $SrcID, 'src');
          my ($TrgStr, $trgeof) = CollectSentences($trgfh, \@trgsent, $TrgID, 'trg');

          # a reader that ran off the end of its document starts over from
          # the top for the next link
          if ($trgeof && $trgfh) {
              close $trgfh;
              $trgfh = OpenDoc($trgdoc);
          }
          if ($srceof && $srcfh) {
              close $srcfh;
              $srcfh = OpenDoc($srcdoc);
          }

          # links without text on both sides are dropped (also in filter mode)
          next LINE unless ($SrcStr && $TrgStr);

          unless ($FilterMode) {
              print $SrcStr;
              print $TrgStr;
              if ($html) { print "<hr>\n"; }
              else       { print "================================\n"; }
          }
      }
      print $line if ($FilterMode);
  }

  close $srcfh if $srcfh;
  close $trgfh if $trgfh;
  close F;
  if ($html) { PrintHtmlTail(); }

  # Map a document name from the alignment file to an existing file.
  # Tries the name itself, the name plus ".gz", and both again below $dir;
  # returns the first candidate that exists, or the name unchanged.
  sub LocateDoc {
      my ($doc) = @_;
      foreach my $candidate ($doc, "$doc.gz", "$dir/$doc", "$dir/$doc.gz") {
          return $candidate if (-e $candidate);
      }
      return $doc;
  }

  # Open a document for reading, through "gzip -cd" if its name ends in .gz.
  # The file name is never passed through a shell.  Returns the file handle,
  # or undef (after a warning on STDERR) if the document cannot be opened, so
  # that the remaining documents are still processed.
  sub OpenDoc {
      my ($path) = @_;
      my $fh;
      my $ok = ($path =~ /\.gz$/)
          ? open($fh, '-|', 'gzip', '-cd', '--', $path)
          : open($fh, '<', $path);
      unless ($ok) {
          warn "Cannot open $path: $!\n";
          return undef;
      }
      return $fh;
  }

  # Read forward in a document and collect the sentences with the given ids.
  #   $fh     - handle of the document (may be undef if it could not be opened)
  #   $ids    - array ref of sentence ids, in document order
  #   $langID - if set, sentences carrying a lang attribute must match it
  #   $label  - 'src' or 'trg', used as prefix of every printed sentence
  # Returns ($text, $eof): the formatted sentences and a flag that is true if
  # the search ended without the last record read being a hit, i.e. the
  # caller should re-open the document.
  sub CollectSentences {
      my ($fh, $ids, $langID, $label) = @_;
      my $text = '';
      my $eof  = 1;
      return ($text, $eof) unless defined $fh;

      # read one sentence element per record; restored on return
      local $/ = '</s>';

      ID: foreach my $id (@{$ids}) {
          while (my $sent = <$fh>) {
              $eof = 0;
              # \Q..\E: ids such as "s1.1" contain regex metacharacters
              if ($sent =~ /s [^\>]*id="\Q$id\E"/s) {
                  if ($langID && $sent =~ /lang=\".*?\"/) {
                      next ID unless ($sent =~ /lang=\"$langID\"/);
                  }
                  $sent =~ s/^.*<s [^\>]*id/($label)/s;    # "(src)" / "(trg)" prefix
                  $sent =~ s/\n/ /gs;
                  $sent =~ s/\<[^\>]*>//gs;                # strip the remaining tags
                  $sent =~ s/ +/ /gs;
                  if ($html) { $sent = Str2Html($sent); }
                  else {
                      # plain text output: decode the XML entities
                      $sent =~ s/\&gt\;/\>/gs;
                      $sent =~ s/\&lt\;/\</gs;
                      $sent =~ s/\&amp\;/\&/gs;
                  }
                  $text .= $sent;
                  if ($html) { $text .= "<br>"; }
                  $text .= "\n";
                  last;
              }
              $eof = 1;
          }
      }
      return ($text, $eof);
  }
  # Hook for converting a sentence to HTML before it is printed in -h mode.
  # It is a plain pass-through: the escaping of '&', '<' and '>' is disabled.
  # In -h mode the caller does not decode the XML entities of the sentence,
  # so the text handed in here still carries them as read from the document.
  sub Str2Html {
      my ($text) = @_;
      return $text;
  }
  # Print the opening part of the HTML page (doctype, head, <body>) to the
  # currently selected output handle.
  sub PrintHtmlHeader {
      my @header_lines = (
          '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"',
          '"http://www.w3.org/TR/REC-html40/loose.dtd">',
          '<html>',
          '<head>',
          '<title>Untitled Document</title>',
          '<meta http-equiv="Content-Type" content="text/html;charset=utf-8">',
          '</head>',
          '<body>',
      );
      # emitted as one string, every line terminated by a newline
      print join('', map { "$_\n" } @header_lines);
  }
  # Print the closing tags of the HTML page to the currently selected
  # output handle.
  sub PrintHtmlTail {
      print "</body>\n" . "</html>\n";
  }
  300. }