PageRenderTime 58ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/ems/web/analysis.php

https://bitbucket.org/ufal/mosesdecoder
PHP | 1447 lines | 1255 code | 119 blank | 73 comment | 350 complexity | 943906f8c9e3be0eb8f9f0e5c4d0f98b MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-3.0, LGPL-3.0

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. # main page frame, triggers the loading of parts
  3. function show_analysis() {
  4. global $task,$user,$setup,$id,$set;
  5. global $dir;
  6. head("Analysis: $task ($user), Set $set, Run $id");
  7. ?><script>
  8. function show(field,sort,count,filter) {
  9. var url = '?analysis=' + field + '_show'
  10. + '&setup=<?php print $setup ?>'
  11. + '&id=<?php print $id ?>'
  12. + '&set=<?php print $set ?>'
  13. + '&sort=' + sort
  14. + '&count=' + count
  15. + '&filter=' + filter;
  16. new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
  17. }
  18. function ngram_show(type,order,count,sort,smooth) {
  19. var url = '?analysis=ngram_' + type + '_show'
  20. + '&setup=<?php print $setup ?>'
  21. + '&id=<?php print $id ?>'
  22. + '&set=<?php print $set ?>'
  23. + '&order=' + order
  24. + '&smooth=' + smooth
  25. + '&sort=' + sort
  26. + '&count=' + count;
  27. var field = (type == "precision" ? "nGramPrecision" : "nGramRecall") + order;
  28. new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
  29. }
  30. function generic_show(field,parameters) {
  31. var url = '?analysis=' + field + '_show'
  32. + '&setup=<?php print $setup ?>'
  33. + '&id=<?php print $id ?>'
  34. + '&set=<?php print $set ?>'
  35. + '&' + parameters;
  36. new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
  37. }
  38. function highlight_phrase(sentence,phrase) {
  39. var input = "input-"+sentence+"-"+phrase;
  40. $(input).setStyle({ borderColor: 'red' });
  41. var output = "output-"+sentence+"-"+phrase;
  42. $(output).setStyle({ borderColor: 'red' });
  43. }
  44. function show_word_info(sentence,cc,tc,te) {
  45. var info = "info-"+sentence;
  46. document.getElementById(info).innerHTML = ''+cc+' occurrences in corpus, '+tc+' distinct translations, translation entropy: '+te;
  47. $(info).setStyle({ opacity: 1 });
  48. }
  49. function lowlight_phrase(sentence,phrase) {
  50. var input = "input-"+sentence+"-"+phrase;
  51. $(input).setStyle({ borderColor: 'black' });
  52. var output = "output-"+sentence+"-"+phrase;
  53. $(output).setStyle({ borderColor: 'black' });
  54. }
  55. function hide_word_info(sentence) {
  56. var info = "info-"+sentence;
  57. $(info).setStyle({ opacity: 0 });
  58. }
  59. function show_biconcor(sentence,phrase) {
  60. var div = "biconcor-"+sentence;
  61. var url = '?analysis=biconcor'
  62. + '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$set,$id); ?>&set=<?php print $set ?>'
  63. + '&sentence=' + sentence
  64. + '&phrase=' + encodeURIComponent(phrase);
  65. document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
  66. $(div).setStyle({ borderStyle: 'solid', 'border-width': '3px', borderColor: 'black' });
  67. new Ajax.Updater(div, url, { method: 'get', evalScripts: true });
  68. }
  69. function close_biconcor(sentence) {
  70. var div = "biconcor-"+sentence;
  71. document.getElementById(div).innerHTML = "";
  72. $(div).setStyle({ borderStyle: 'none', 'border-width': '0px', borderColor: 'white' });
  73. }
  74. </script>
  75. </head>
  76. <body>
  77. <div id="nGramSummary"><?php ngram_summary() ?></div>
  78. <div id="CoverageDetails"></div>
  79. <div id="PrecisionByCoverage"></div>
  80. <div id="PrecisionRecallDetails"></div>
  81. <div id="bleu">(loading...)</div>
  82. <script language="javascript">
  83. show('bleu','',5,'');
  84. </script>
  85. </body></html>
  86. <?php
  87. }
  88. function precision_by_coverage() {
  89. global $experiment,$evalset,$dir,$set,$id;
  90. $img_width = 1000;
  91. print "<h3>Precision of Input Words by Coverage</h3>";
  92. print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue).";
  93. print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type.";
  94. // load data
  95. $data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage"));
  96. $total = 0;
  97. $log_info = array();
  98. for($i=0;$i<count($data);$i++) {
  99. $item = split("\t",$data[$i]);
  100. $info[$item[0]]["precision"] = $item[1];
  101. $info[$item[0]]["delete"] = $item[2];
  102. $info[$item[0]]["length"] = $item[3];
  103. $info[$item[0]]["total"] = $item[4];
  104. $total += $item[4];
  105. $log_count = -1;
  106. if ($item[0]>0) {
  107. $log_count = (int) (log($item[0])/log(2));
  108. }
  109. if (!array_key_exists($log_count,$log_info)) {
  110. $log_info[$log_count]["precision"] = 0;
  111. $log_info[$log_count]["delete"] = 0;
  112. $log_info[$log_count]["length"] = 0;
  113. $log_info[$log_count]["total"] = 0;
  114. }
  115. $log_info[$log_count]["precision"] += $item[1];
  116. $log_info[$log_count]["delete"] += $item[2];
  117. $log_info[$log_count]["length"] += $item[3];
  118. $log_info[$log_count]["total"] += $item[4];
  119. }
  120. print "<h4>By log<sub>2</sub>-count in the training corpus</h4>";
  121. precision_by_coverage_graph("byCoverage",$log_info,$total,$img_width,SORT_NUMERIC);
  122. # load factored data
  123. $d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id));
  124. while (false !== ($file = $d->read())) {
  125. if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match)) {
  126. precision_by_coverage_factored($img_width,$total,$file,$match[1]);
  127. }
  128. }
  129. }
  130. function precision_by_coverage_factored($img_width,$total,$file,$factor_id) {
  131. global $dir,$set,$id;
  132. $data = file(get_current_analysis_filename("precision",$file));
  133. for($i=0;$i<count($data);$i++) {
  134. $item = split("\t",$data[$i]);
  135. $factor = $item[0];
  136. $count = $item[1];
  137. $info_factored[$factor][$count]["precision"] = $item[2];
  138. $info_factored[$factor][$count]["delete"] = $item[3];
  139. $info_factored[$factor][$count]["length"] = $item[4];
  140. $info_factored[$factor][$count]["total"] = $item[5];
  141. $info_factored_sum[$factor]["precision"] += $item[2];
  142. $info_factored_sum[$factor]["delete"] += $item[3];
  143. $info_factored_sum[$factor]["length"] += $item[4];
  144. $info_factored_sum[$factor]["total"] += $item[5];
  145. $total_factored[$factor] += $item[5];
  146. $log_count = -1;
  147. if ($count>0) {
  148. $log_count = (int) (log($count)/log(2));
  149. }
  150. $log_info_factored[$factor][$log_count]["precision"] += $item[2];
  151. $log_info_factored[$factor][$log_count]["delete"] += $item[3];
  152. $log_info_factored[$factor][$log_count]["length"] += $item[4];
  153. $log_info_factored[$factor][$log_count]["total"] += $item[5];
  154. }
  155. print "<h4>By factor ".factor_name("input",$factor_id)."</h4>";
  156. precision_by_coverage_graph("byFactor",$info_factored_sum,$total,$img_width,SORT_STRING);
  157. print "<h4>For each factor, by log<sub>2</sub>-count in the corpus</h4>";
  158. foreach ($log_info_factored as $factor => $info) {
  159. if ($total_factored[$factor]/$total > 0.01) {
  160. print "<table style=\"display:inline;\"><tr><td align=center><font size=-2><b>$factor</b></font></td></tr><tr><td align=center>";
  161. precision_by_coverage_graph("byCoverageFactor$factor",$info,$total_factored[$factor],10+2*$img_width*$total_factored[$factor]/$total,SORT_NUMERIC);
  162. print "</td></tr></table>";
  163. }
  164. }
  165. }
  166. function precision_by_word($type) {
  167. global $dir,$set,$id;
  168. $byCoverage = -2;
  169. $byFactor = "false";
  170. if ($type == "byCoverage") {
  171. $byCoverage = (int) $_GET["type"];
  172. }
  173. else if ($type == "byFactor") {
  174. $byFactor = $_GET["type"];
  175. }
  176. else if (preg_match("/byCoverageFactor(.+)/",$type,$match)) {
  177. $byCoverage = (int) $_GET["type"];
  178. $byFactor = $match[1];
  179. }
  180. $data = file(get_current_analysis_filename("precision","precision-by-input-word"));
  181. for($i=0;$i<count($data);$i++) {
  182. $line = rtrim($data[$i]);
  183. $item = split("\t",$line);
  184. //# filter for count
  185. $count = $item[4];
  186. $log_count = -1;
  187. if ($count>0) {
  188. $log_count = (int) (log($count)/log(2));
  189. }
  190. if ($byCoverage != -2 && $byCoverage != $log_count) {
  191. continue;
  192. }
  193. //# filter for factor
  194. $word = $item[5];
  195. if ($byFactor != "false" && $byFactor != $item[6]) {
  196. continue;
  197. }
  198. $info[$word]["precision"] = $item[0];
  199. $info[$word]["delete"] = $item[1];
  200. $info[$word]["length"] = $item[2];
  201. $info[$word]["total"] = $item[3];
  202. $total += $item[3];
  203. }
  204. print "<table border=1><tr><td align=center>Count</td><td align=center colspan=2>Precision</td><td align=center colspan=2>Delete</td><td align=center>Length</td></tr>\n";
  205. foreach ($info as $word => $wordinfo) {
  206. print "<tr><td align=center><a href=\"javascript:show('bleu','order',5,'".base64_encode($word)."')\">$word</a></td>";
  207. printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%.1f/%d</font></td>",$wordinfo["precision"]/$wordinfo["total"]*100,"%",$wordinfo["precision"],$wordinfo["total"]);
  208. printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%d/%d</font></td>",$wordinfo["delete"]/$wordinfo["total"]*100,"%",$wordinfo["delete"],$wordinfo["total"]);
  209. printf("<td align=right>%.3f</td>",$wordinfo["length"]/$wordinfo["total"]);
  210. print "</tr>";
  211. }
  212. print "</table>\n";
  213. }
  214. function precision_by_coverage_latex($name,$log_info,$total,$img_width,$sort_type) {
  215. $keys = array_keys($log_info);
  216. sort($keys,$sort_type);
  217. $img_width /= 100;
  218. print "<div id=\"LatexToggle$name\" onClick=\"document.getElementById('Latex$name').style.display = 'block'; this.style.display = 'none';\" style=\"display:none;\"><font size=-2>(show LaTeX)</font></div>\n";
  219. print "<div id=\"Latex$name\" style=\"display:none;\">\n";
  220. print "<code>\\begin{tikzpicture}<br>";
  221. print "% co-ordinates for precision<br>";
  222. for($line=0;$line<=9;$line++) {
  223. $height = 1.8-$line/10*1.8;
  224. print "\\draw[thin,lightgray] (0.2,-$height) ";
  225. print "node[anchor=east,black] {".$line."0\\%} -- ";
  226. print "($img_width,-$height) ;<br>\n";
  227. }
  228. print "% co-ordinates for deletion<br>\n";
  229. for($line=0;$line<=3;$line++) {
  230. $height = 2+$line/10*1.80;
  231. print "\\draw[thin,lightgray] (0.2,-$height) ";
  232. if ($line != 0) {
  233. print "node[anchor=east,black] {".$line."0\\%} ";
  234. }
  235. print "-- ($img_width,-$height) ;<br>\n";
  236. }
  237. print "% boxes<br>\n";
  238. $total_so_far = 0;
  239. foreach ($keys as $i) {
  240. $prec_ratio = $log_info[$i]["precision"]/$log_info[$i]["total"];
  241. $x = .2+($img_width-.2) * $total_so_far/$total;
  242. $y = 1.80-($prec_ratio*1.80);
  243. $width = $img_width * $log_info[$i]["total"]/$total;
  244. $height = $prec_ratio*1.80;
  245. $width += $x;
  246. $height += $y;
  247. print "\\filldraw[very thin,gray] ($x,-$y) rectangle($width,-$height) ;<br>";
  248. print "\\draw[very thin,black] ($x,-$y) rectangle($width,-$height);<br>";
  249. if ($width-$x>.1) {
  250. print "\\draw (".(($x+$width)/2).",-1.8) node[anchor=north,black] {".$i."};<br>";
  251. }
  252. $del_ratio = $log_info[$i]["delete"]/$log_info[$i]["total"];
  253. $height = $del_ratio*1.80;
  254. $height += 2;
  255. print "\\filldraw[very thin,lightgray] ($x,-2) rectangle($width,-$height);<br>\n";
  256. print "\\draw[very thin,black] ($x,-2) rectangle($width,-$height);<br>\n";
  257. $total_so_far += $log_info[$i]["total"];
  258. }
  259. print "\\end{tikzpicture}</code>";
  260. print "</div>";
  261. }
  262. function precision_by_coverage_graph($name,$log_info,$total,$img_width,$sort_type) {
  263. $keys = array_keys($log_info);
  264. sort($keys,$sort_type);
  265. print "<div id=\"Toggle$name\" onClick=\"document.getElementById('Table$name').style.display = 'none'; document.getElementById('LatexToggle$name').style.display = 'none'; document.getElementById('Latex$name').style.display = 'none'; this.style.display = 'none';\" style=\"display:none;\"><font size=-2>(hide table)</font></div>\n";
  266. precision_by_coverage_latex($name,$log_info,$total,$img_width,$sort_type);
  267. print "<div id=\"Table$name\" style=\"display:none;\">\n";
  268. print "<table border=1><tr><td align=center>Count</td><td align=center colspan=2>Precision</td><td align=center colspan=2>Delete</td><td align=center>Length</td></tr>\n";
  269. foreach ($keys as $i) {
  270. if (array_key_exists($i,$log_info)) {
  271. print "<tr><td align=center>$i</td>";
  272. printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%.1f/%d</font></td>",$log_info[$i]["precision"]/$log_info[$i]["total"]*100,"%",$log_info[$i]["precision"],$log_info[$i]["total"]);
  273. printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%d/%d</font></td>",$log_info[$i]["delete"]/$log_info[$i]["total"]*100,"%",$log_info[$i]["delete"],$log_info[$i]["total"]);
  274. printf("<td align=right>%.3f</td>",$log_info[$i]["length"]/$log_info[$i]["total"]);
  275. print "<td><A HREF=\"javascript:generic_show('PrecisionByWord$name','type=$i')\">&#x24BE;</A></td>";
  276. print "</tr>";
  277. }
  278. }
  279. print "</table><div id=\"PrecisionByWord$name\"></div></div>";
  280. print "<div id=\"Graph$name\" onClick=\"document.getElementById('Table$name').style.display = 'block'; document.getElementById('LatexToggle$name').style.display = 'block'; document.getElementById('Toggle$name').style.display = 'block';\">";
  281. print "<canvas id=\"$name\" width=$img_width height=300></canvas></div>";
  282. print "<script language=\"javascript\">
  283. var canvas = document.getElementById(\"$name\");
  284. var ctx = canvas.getContext(\"2d\");
  285. ctx.lineWidth = 0.5;
  286. ctx.font = '9px serif';
  287. ";
  288. for($line=0;$line<=9;$line++) {
  289. $height = 180-$line/10*180;
  290. print "ctx.moveTo(20, $height);\n";
  291. print "ctx.lineTo($img_width, $height);\n";
  292. if ($line != 0) {
  293. print "ctx.fillText(\"${line}0\%\", 0, $height+4);";
  294. }
  295. }
  296. for($line=0;$line<=3;$line++) {
  297. $height = 200+$line/10*180;
  298. print "ctx.moveTo(20, $height);\n";
  299. print "ctx.lineTo($img_width, $height);\n";
  300. if ($line != 0) {
  301. print "ctx.fillText(\"${line}0\%\", 0, $height+4);";
  302. }
  303. }
  304. print "ctx.strokeStyle = \"rgb(100,100,100)\"; ctx.stroke();\n";
  305. $total_so_far = 0;
  306. foreach ($keys as $i) {
  307. $prec_ratio = $log_info[$i]["precision"]/$log_info[$i]["total"];
  308. $x = (int)(20+($img_width-20) * $total_so_far / $total);
  309. $y = (int)(180-($prec_ratio*180));
  310. $width = (int)($img_width * $log_info[$i]["total"]/$total);
  311. $height = (int)($prec_ratio*180);
  312. print "ctx.fillStyle = \"rgb(200,200,0)\";";
  313. print "ctx.fillRect ($x, $y, $width, $height);";
  314. $del_ratio = $log_info[$i]["delete"]/$log_info[$i]["total"];
  315. $height = (int)($del_ratio*180);
  316. print "ctx.fillStyle = \"rgb(100,100,255)\";";
  317. print "ctx.fillRect ($x, 200, $width, $height);";
  318. $total_so_far += $log_info[$i]["total"];
  319. if ($width>3) {
  320. print "ctx.fillStyle = \"rgb(0,0,0)\";";
  321. // print "ctx.rotate(-1.5707);";
  322. print "ctx.fillText(\"$i\", $x+$width/2-3, 190);";
  323. //print "ctx.rotate(1.5707);";
  324. }
  325. }
  326. print "</script>";
  327. }
  328. //# stats on precision and recall
  329. function precision_recall_details() {
  330. ?>
  331. <table width=100%>
  332. <tr>
  333. <td width=25% valign=top><div id="nGramPrecision1">(loading...)</div></td>
  334. <td width=25% valign=top><div id="nGramPrecision2">(loading...)</div></td>
  335. <td width=25% valign=top><div id="nGramPrecision3">(loading...)</div></td>
  336. <td width=25% valign=top><div id="nGramPrecision4">(loading...)</div></td>
  337. </tr><tr>
  338. <td width=25% valign=top><div id="nGramRecall1">(loading...)</div></td>
  339. <td width=25% valign=top><div id="nGramRecall2">(loading...)</div></td>
  340. <td width=25% valign=top><div id="nGramRecall3">(loading...)</div></td>
  341. <td width=25% valign=top><div id="nGramRecall4">(loading...)</div></td>
  342. </tr></table>
  343. <script language="javascript">
  344. ngram_show('precision',1,5,'',0);
  345. ngram_show('precision',2,5,'',0);
  346. ngram_show('precision',3,5,'',0);
  347. ngram_show('precision',4,5,'',0);
  348. ngram_show('recall',1,5,'',0);
  349. ngram_show('recall',2,5,'',0);
  350. ngram_show('recall',3,5,'',0);
  351. ngram_show('recall',4,5,'',0);
  352. </script>
  353. <?php
  354. }
  355. //# stats on ngram precision
  356. function ngram_summary() {
  357. global $experiment,$evalset,$dir,$set,$id;
  358. //# load data
  359. $data = file(get_current_analysis_filename("basic","summary"));
  360. for($i=0;$i<count($data);$i++) {
  361. $item = split(": ",$data[$i]);
  362. $info[$item[0]] = $item[1];
  363. }
  364. print "<table cellspacing=5 width=100%><tr><td valign=top align=center bgcolor=#eeeeee>";
  365. //#foreach (array("precision","recall") as $type) {
  366. print "<b>Precision of Output</b>\n";
  367. $type = "precision";
  368. print "<table><tr><td>$type</td><td>1-gram</td><td>2-gram</td><td>3-gram</td><td>4-gram</td></tr>\n";
  369. printf("<tr><td>correct</td><td>%d</td><td>%d</td><td>%d</td><td>%d</td></tr>\n",
  370. $info["$type-1-correct"],
  371. $info["$type-2-correct"],
  372. $info["$type-3-correct"],
  373. $info["$type-4-correct"]);
  374. printf("<tr><td>&nbsp;</td><td>%.1f%s</td><td>%.1f%s</td><td>%.1f%s</td><td>%.1f%s</td></tr>\n",
  375. $info["$type-1-correct"]/$info["$type-1-total"]*100,'%',
  376. $info["$type-2-correct"]/$info["$type-2-total"]*100,'%',
  377. $info["$type-3-correct"]/$info["$type-3-total"]*100,'%',
  378. $info["$type-4-correct"]/$info["$type-4-total"]*100,'%');
  379. printf("<tr><td>wrong</td><td>%d</td><td>%d</td><td>%d</td><td>%d</td></tr>\n",
  380. $info["$type-1-total"]-$info["$type-1-correct"],
  381. $info["$type-2-total"]-$info["$type-2-correct"],
  382. $info["$type-3-total"]-$info["$type-3-correct"],
  383. $info["$type-4-total"]-$info["$type-4-correct"]);
  384. print "</table>";
  385. //}
  386. print "<A HREF=\"javascript:generic_show('PrecisionRecallDetails','')\">details</A> ";
  387. if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage"))) {
  388. print "| <A HREF=\"javascript:generic_show('PrecisionByCoverage','')\">precision of input by coverage</A> ";
  389. }
  390. print "</td><td valign=top valign=top align=center bgcolor=#eeeeee>";
  391. $each_score = explode(" ; ",$experiment[$id]->result[$set]);
  392. $header = "";
  393. $score_line = "";
  394. for($i=0;$i<count($each_score);$i++) {
  395. if (preg_match('/([\d\(\)\.\s]+) (BLEU[\-c]*)/',$each_score[$i],$match) ||
  396. preg_match('/([\d\(\)\.\s]+) (IBM[\-c]*)/',$each_score[$i],$match)) {
  397. $header .= "<td>$match[2]</td>";
  398. $score_line .= "<td>$match[1]</td>";
  399. }
  400. }
  401. print "<b>Metrics</b><table border=1><tr>".$header."</tr><tr>".$score_line."</tr></table>";
  402. printf("<p>length-diff: %d (%.1f%s)",$info["precision-1-total"]-$info["recall-1-total"],($info["precision-1-total"]-$info["recall-1-total"])/$info["recall-1-total"]*100,"%");
  403. // coverage
  404. if (file_exists(get_current_analysis_filename("coverage","corpus-coverage-summary"))) {
  405. print "</td><td valign=top align=center bgcolor=#eeeeee>";
  406. print "<div id=\"CoverageSummary\">";
  407. coverage_summary();
  408. print "</div>";
  409. }
  410. // phrase segmentation
  411. if (file_exists(get_current_analysis_filename("basic","segmentation")) ||
  412. file_exists(get_current_analysis_filename("basic","rule"))) {
  413. print "</td><td valign=top align=center bgcolor=#eeeeee>";
  414. print "<div id=\"SegmentationSummary\">";
  415. segmentation_summary();
  416. print "</div>";
  417. }
  418. // rules
  419. if (file_exists(get_current_analysis_filename("basic","rule"))) {
  420. print "</td><td valign=top align=center bgcolor=#eeeeee>";
  421. print "<div id=\"RuleSummary\">";
  422. rule_summary();
  423. print "</div>";
  424. }
  425. print "</td></tr></table>";
  426. }
  427. // details on ngram precision/recall
  428. function ngram_show($type) {
  429. global $set,$id,$dir;
  430. // load data
  431. $order = $_GET['order'];
  432. $data = file(get_current_analysis_filename("basic","n-gram-$type.$order"));
  433. for($i=0;$i<count($data);$i++) {
  434. $item = split("\t",$data[$i]);
  435. $line["total"] = $item[0];
  436. $line["correct"] = $item[1];
  437. $line["ngram"] = $item[2];
  438. $ngram[] = $line;
  439. }
  440. // sort option
  441. $sort = $_GET['sort'];
  442. $smooth = $_GET['smooth'];
  443. if ($sort == '') {
  444. $sort = 'ratio_worst';
  445. $smooth = 1;
  446. }
  447. // sort index
  448. for($i=0;$i<count($ngram);$i++) {
  449. if ($sort == "abs_worst") {
  450. $ngram[$i]["index"] = $ngram[$i]["correct"] - $ngram[$i]["total"];
  451. }
  452. else if ($sort == "ratio_worst") {
  453. $ngram[$i]["index"] = ($ngram[$i]["correct"] + $smooth) / ($ngram[$i]["total"] + $smooth);
  454. }
  455. }
  456. // sort
  457. function cmp($a, $b) {
  458. if ($a["index"] == $b["index"]) {
  459. return 0;
  460. }
  461. return ($a["index"] < $b["index"]) ? -1 : 1;
  462. }
  463. usort($ngram, 'cmp');
  464. // display
  465. $count = $_GET['count'];
  466. if ($count == 0) { $count = 5; }
  467. print "<B>$order-gram $type</B><br><font size=-1>sorted by ";
  468. if ($sort == "ratio_worst") {
  469. print "ratio ";
  470. print "smooth-$smooth ";
  471. print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'ratio_worst',$smooth+1)\">+</A> ";
  472. print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'ratio_worst',$smooth-1)\">-</A> ";
  473. }
  474. else {
  475. print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'ratio_worst',1)\">ratio</A> ";
  476. }
  477. if ($sort == "abs_worst") {
  478. print "absolute ";
  479. }
  480. else {
  481. print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'abs_worst',0)\">absolute</A> ";
  482. }
  483. print "showing $count ";
  484. if ($count < 9999) {
  485. print "<A HREF=\"javascript:ngram_show('$type',$order,$count+5,'$sort',$smooth)\">more</A> ";
  486. print "<A HREF=\"javascript:ngram_show('$type',$order,9999,'$sort',$smooth)\">all</A> ";
  487. }
  488. else {
  489. print "<A HREF=\"javascript:ngram_show('$type',$order,5,'$sort',$smooth)\">top5</A> ";
  490. }
  491. print "</font><br>\n";
  492. print "<table width=100%>\n";
  493. print "<tr><td>$order-gram</td><td>ok</td><td>x</td><td>ratio</td></tr>\n";
  494. for($i=0;$i<$count && $i<count($ngram);$i++) {
  495. $line = $ngram[$i];
  496. print "<tr><td>".$line["ngram"]."</td>";
  497. print "<td>".$line["correct"]."</td>";
  498. print "<td>".($line["total"]-$line["correct"])."</td>";
  499. printf("<td>%.3f</td></tr>",$line["correct"]/$line["total"]);
  500. }
  501. print "</table>\n";
  502. }
  503. // details on ngram coverage
  504. function coverage_details() {
  505. global $dir,$set,$id;
  506. $count = array(); $token = array();
  507. foreach (array("ttable","corpus") as $corpus) {
  508. foreach (array("token","type") as $b) {
  509. for($i=0;$i<=7;$i++) {
  510. foreach (array("6+","2-5","1","0") as $range) {
  511. $count[$corpus][$b][$i][$range] = 0;
  512. }
  513. $total[$corpus][$b][$i] = 0;
  514. }
  515. }
  516. $data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
  517. for($i=0;$i<count($data);$i++) {
  518. $item = split("\t",$data[$i]);
  519. if ($item[1]>5) {
  520. $count[$corpus]["type"][$item[0]]["6+"] += $item[2];
  521. $count[$corpus]["token"][$item[0]]["6+"] += $item[3];
  522. }
  523. else if ($item[1]>1) {
  524. $count[$corpus]["type"][$item[0]]["2-5"] += $item[2];
  525. $count[$corpus]["token"][$item[0]]["2-5"] += $item[3];
  526. }
  527. else if ($item[1]==1) {
  528. $count[$corpus]["type"][$item[0]]["1"] += $item[2];
  529. $count[$corpus]["token"][$item[0]]["1"] += $item[3];
  530. }
  531. else {
  532. $count[$corpus]["type"][$item[0]]["0"] += $item[2];
  533. $count[$corpus]["token"][$item[0]]["0"] += $item[3];
  534. }
  535. $total[$corpus]["type"][$item[0]] += $item[2];
  536. $total[$corpus]["token"][$item[0]] += $item[3];
  537. }
  538. }
  539. print "<b>coverage</b><br>\n";
  540. print "<table width=100%><tr>";
  541. foreach (array("token","type") as $by) {
  542. for($i=1;$i<=4;$i++) {
  543. print "<td align=center><b>$i-gram ($by)</b><br>\n";
  544. print "<table><tr><td></td><td>model</td><td>corpus</td></tr>\n";
  545. foreach (array("0","1","2-5","6+") as $range) {
  546. print "<tr><td>$range</td>";
  547. foreach (array("ttable","corpus") as $corpus) {
  548. printf("<td align=right nowrap>%d (%.1f%s)</td>",$count[$corpus][$by][$i][$range],100*$count[$corpus][$by][$i][$range]/($total[$corpus][$by][$i]+0.0001),"%");
  549. }
  550. print "</tr>\n";
  551. }
  552. print "</table></td>\n";
  553. }
  554. print "</tr><tr>";
  555. }
  556. print "</tr></table>\n";
  557. $data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","ttable-unknown")));
  558. for($i=0;$i<count($data);$i++) {
  559. list($word,$count) = split("\t",$data[$i]);
  560. $item["word"] = $word;
  561. $item["count"] = rtrim($count);
  562. $unknown[] = $item;
  563. }
  564. function cmp($a,$b) {
  565. if ($a["count"] > $b["count"]) {
  566. return -1;
  567. }
  568. else if ($a["count"] < $b["count"]) {
  569. return 1;
  570. }
  571. else {
  572. return strcmp($a["word"],$b["word"]);
  573. }
  574. }
  575. usort($unknown, 'cmp');
  576. print "<b>unknown words (to model)</b><br>\n";
  577. print "<table><tr><td valign=top><table>";
  578. $state = 5;
  579. foreach ($unknown as $item) {
  580. if ($item["count"] < $state) {
  581. if ($state == 5) { print "</table>"; }
  582. print "</td><td valign=top><b>".$item["count"].":</b> ";
  583. $state = $item["count"];
  584. if ($state == 1) { print "<font size=-1>"; }
  585. }
  586. else if ($state<5) {
  587. print ", ";
  588. }
  589. if ($state == 5) {
  590. print "<tr><td>".$item["count"]."</td><td>".$item["word"]."</td></tr>";
  591. }
  592. else {
  593. print $item["word"];
  594. }
  595. }
  596. print "</font></td></tr></table>\n";
  597. }
  598. function filename_fallback_to_factored($file) {
  599. if (file_exists($file)) {
  600. return $file;
  601. }
  602. $path = pathinfo($file);
  603. $dh = opendir($path['dirname']);
  604. while (($factored_file = readdir($dh)) !== false) {
  605. if (strlen($factored_file) > strlen($path['basename']) &&
  606. substr($factored_file,0,strlen($path['basename'])) == $path['basename'] &&
  607. preg_match("/0/",substr($factored_file,strlen($path['basename'])))) {
  608. return $path['dirname']."/".$factored_file;
  609. }
  610. }
  611. // found nothing...
  612. return $file;
  613. }
  614. function factor_name($input_output,$factor_id) {
  615. global $dir,$set,$id;
  616. $file = get_current_analysis_filename("coverage","factor-names");
  617. if (!file_exists($file)) {
  618. return $factor_id;
  619. }
  620. $in_out_names = file($file);
  621. $names = explode(",",trim($in_out_names[($input_output == "input")?0:1]));
  622. return "'".$names[$factor_id]."' ($factor_id)";
  623. }
  624. // stats on ngram coverage
  625. function coverage_summary() {
  626. global $dir,$set,$id,$corpus;
  627. if (array_key_exists("by",$_GET)) { $by = $_GET['by']; }
  628. else { $by = 'token'; }
  629. $total = array(); $count = array();
  630. foreach (array("ttable","corpus") as $corpus) {
  631. foreach (array("token","type") as $b) {
  632. foreach (array("6+","2-5","1","0") as $c) {
  633. $count[$corpus][$b][$c] = 0;
  634. }
  635. $total[$corpus][$b] = 0;
  636. }
  637. $data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
  638. for($i=0;$i<count($data);$i++) {
  639. $item = split("\t",$data[$i]);
  640. if ($item[0] == 1) {
  641. if ($item[1]>5) {
  642. $count[$corpus]["type"]["6+"] += $item[2];
  643. $count[$corpus]["token"]["6+"] += $item[3];
  644. }
  645. else if ($item[1]>1) {
  646. $count[$corpus]["type"]["2-5"] += $item[2];
  647. $count[$corpus]["token"]["2-5"] += $item[3];
  648. }
  649. else if ($item[1]==1) {
  650. $count[$corpus]["type"]["1"] += $item[2];
  651. $count[$corpus]["token"]["1"] += $item[3];
  652. }
  653. else {
  654. $count[$corpus]["type"]["0"] += $item[2];
  655. $count[$corpus]["token"]["0"] += $item[3];
  656. }
  657. $total[$corpus]["type"] += $item[2];
  658. $total[$corpus]["token"] += $item[3];
  659. }
  660. }
  661. }
  662. print "<b>Coverage</b>\n";
  663. print "<table><tr><td></td><td>model</td><td>corpus</td></tr>\n";
  664. foreach (array("0","1","2-5","6+") as $range) {
  665. print "<tr><td>$range</td>";
  666. foreach (array("ttable","corpus") as $corpus) {
  667. printf("<td align=right nowrap>%d (%.1f%s)</td>",$count[$corpus][$by][$range],100*$count[$corpus][$by][$range]/($total[$corpus][$by]+0.0001),"%");
  668. }
  669. print "</tr>\n";
  670. }
  671. print "</table>\n";
  672. if ($by == 'token') { print "by token"; } else {
  673. print "<A HREF=\"javascript:generic_show('CoverageSummary','by=token')\">by token</A> ";
  674. }
  675. print " / ";
  676. if ($by == 'type') { print "by type"; } else {
  677. print "<A HREF=\"javascript:generic_show('CoverageSummary','by=type')\">by type</A> ";
  678. }
  679. print " / ";
  680. print "<div id=\"CoverageDetailsLink\"><A HREF=\"javascript:generic_show('CoverageDetails','')\">details</A></div> ";
  681. }
  682. // stats on segmentation (phrase-based)
  683. function segmentation_summary() {
  684. global $dir,$set,$id;
  685. if (array_key_exists("by",$_GET)) { $by = $_GET['by']; }
  686. else { $by = 'word'; }
  687. $count = array();
  688. for($i=0;$i<=4;$i++) {
  689. $count[$i] = array();
  690. for($j=0;$j<=4;$j++) {
  691. $count[$i][$j] = 0;
  692. }
  693. }
  694. $total = 0;
  695. $file = get_current_analysis_filename("basic","segmentation");
  696. if (file_exists($file)) {
  697. $data = file($file);
  698. for($i=0;$i<count($data);$i++) {
  699. list($in,$out,$c) = split("\t",$data[$i]);
  700. if ($by == "word") { $c *= $in; }
  701. if ($in>4) { $in = 4; }
  702. if ($out>4) { $out = 4; }
  703. $total += $c;
  704. $count[$in][$out] += $c;
  705. }
  706. }
  707. else {
  708. $data = file(get_current_analysis_filename("basic","rule"));
  709. for($i=0;$i<count($data);$i++) {
  710. $field = split("\t",$data[$i]);
  711. $type = $field[0];
  712. $rule = $field[1];
  713. if (count($field) > 2) { $c = $field[2]; } else { $c = 0; }
  714. if ($type == "rule") {
  715. list($rule_in,$in,$nt,$rule_out,$out) = split(":",$rule);
  716. if ($by == "word") { $c *= $in; }
  717. if ($in>4) { $in = 4; }
  718. if ($out>4) { $out = 4; }
  719. $total += $c;
  720. $count[$in][$out] += $c;
  721. }
  722. }
  723. }
  724. print "<b>Phrase Segmentation</b><br>\n";
  725. print "<table>";
  726. print "<tr><td></td><td align=center>1</td><td align=center>2</td><td align=center>3</td><td align=center>4+</td></tr>";
  727. for($in=1;$in<=4;$in++) {
  728. print "<tr><td nowrap>$in".($in==4?"+":"")." to</td>";
  729. for($out=1;$out<=4;$out++) {
  730. if (array_key_exists($in,$count) &&
  731. array_key_exists($out,$count[$in])) {
  732. $c = $count[$in][$out];
  733. }
  734. else { $c = 0; }
  735. printf("<td align=right nowrap>%d (%.1f%s)</td>",$c,100*$c/$total,"%");
  736. }
  737. print "</tr>";
  738. }
  739. print "</table>\n";
  740. if ($by == 'word') { print "by word"; } else {
  741. print "<A HREF=\"javascript:generic_show('SegmentationSummary','by=word')\">by word</A> ";
  742. }
  743. print " / ";
  744. if ($by == 'phrase') { print "by phrase"; } else {
  745. print "<A HREF=\"javascript:generic_show('SegmentationSummary','by=phrase')\">by phrase</A> ";
  746. }
  747. }
  // hierarchical rules used in translation
  //
  // Reads the "rule" analysis file (tab-separated: type, value[, count]) and
  // prints the average number of glue rules and the average tree depth per
  // sentence, the average number of non-terminals per rule, and a table of
  // rule shapes (digits removed from both rule sides) with absolute and
  // relative frequencies.
  //
  // Recognised line types are "sentence-count", "glue-rule" and "depth";
  // every other line is treated as a rule whose value has the form
  // rule_in:word_in:nt:rule_out:word_out.
  function rule_summary() {
    global $dir,$set,$id;
    $file = get_current_analysis_filename("basic","rule");
    $data = file_exists($file) ? file($file) : false;
    if ($data === false) { $data = array(); }

    $count_nt = array();
    $nt_count = 0; $total = 0;
    $sentence_count = 0; $glue_rule_sum = 0; $depth_sum = 0;
    foreach ($data as $item) {
      $field = explode("\t",rtrim($item,"\r\n"));
      if (count($field) < 2) { continue; } // skip malformed lines
      $type = $field[0];
      $d = $field[1];
      $d2 = count($field) > 2 ? (float) $field[2] : 0;
      if ($type == "sentence-count") {
        $sentence_count = (float) $d;
      }
      else if ($type == "glue-rule") {
        $glue_rule_sum = (float) $d;
      }
      else if ($type == "depth") {
        $depth_sum = (float) $d;
      }
      else {
        $part = explode(":",$d);
        if (count($part) < 5) { continue; } // skip malformed rules
        // non-terminal placeholders a/b/c are displayed as x/y/z
        $rule_in = strtr($part[0],"abc","xyz");
        $rule_out = strtr($part[3],"abc","xyz");
        $nt_count += $d2 * (int) $part[2];
        $just_nt = preg_replace("/\d/","",$rule_in)."-".preg_replace("/\d/","",$rule_out);
        if ($just_nt == "-") { $just_nt = "lexical"; }
        if (!array_key_exists($just_nt,$count_nt)) { $count_nt[$just_nt] = 0; }
        $count_nt[$just_nt] += $d2;
        $total += $d2;
      }
    }

    // averages are computed after the whole file was read, so they do not
    // depend on the "sentence-count" line preceding the other lines, and
    // they fall back to 0 instead of dividing by zero
    $glue_rule = $sentence_count > 0 ? $glue_rule_sum / $sentence_count : 0;
    $depth = $sentence_count > 0 ? $depth_sum / $sentence_count : 0;

    print "<b>Rules</b><br>\n";
    printf("glue rule: %.2f<br>\n",$glue_rule);
    printf("tree depth: %.2f<br>\n",$depth);
    printf("nt/rule: %.2f<br>\n",$total > 0 ? $nt_count/$total : 0);
    print "<table>\n";
    foreach ($count_nt as $label => $c) {
      printf("<tr><td>%s</td><td align=right>%d</td><td align=right>%.1f%s</td></tr>\n",$label,$c,$total > 0 ? $c/$total*100 : 0,'%');
    }
    print "</table>\n";
  }
  // annotated sentences, navigation
  //
  // Prints the header (sort order links, number of sentences shown, active
  // filter), calls sentence_annotation() for the sentences themselves, and
  // prints the "N more" footer links.
  //
  // Request parameters (all of them are untrusted user input):
  //   count  - number of sentences to show; missing or <= 0 means 5
  //   sort   - order|best|25|avg|75|worst; anything else is shown as "order"
  //   filter - base64 encoded word used to filter the sentences
  function bleu_show() {
    // the integer cast keeps arbitrary text out of the generated javascript
    $count = array_key_exists("count",$_GET) ? (int) $_GET['count'] : 0;
    if ($count <= 0) { $count = 5; }

    $filter = "";
    if (array_key_exists("filter",$_GET) && is_string($_GET['filter'])) {
      $filter = base64_decode($_GET['filter']);
    }
    // base64 output only contains [A-Za-z0-9+/=], safe inside a JS string
    $filter_arg = base64_encode($filter);

    // only known sort keys are ever written back into the page
    $sort_option = array(
      array("order","order"), array("best","best"), array("25","25%"),
      array("avg","avg"), array("75","75%"), array("worst","worst"));
    $sort = "order";
    if (array_key_exists("sort",$_GET)) {
      foreach ($sort_option as $option) {
        if ($_GET['sort'] === $option[0]) { $sort = $option[0]; }
      }
    }

    print "<b>annotated sentences</b><br><font size=-1>sorted by: ";
    foreach ($sort_option as $option) {
      list($key,$label) = $option;
      $separator = ($key == "worst") ? "; " : " "; // last entry ends the list
      if ($sort === $key) {
        print $label.$separator;
      }
      else {
        print "<A HREF=\"javascript:show('bleu','$key',$count,'$filter_arg')\">$label</A>".$separator;
      }
    }
    print "showing: $count ";
    print "<A HREF=\"javascript:show('bleu','$sort',5+$count,'$filter_arg')\">more</A> ";
    print "<A HREF=\"javascript:show('bleu','$sort',9999,'$filter_arg')\">all</A>";
    if ($filter != "") {
      // the decoded filter is arbitrary text: escape it for HTML output
      print "; filter: '".htmlspecialchars($filter,ENT_QUOTES,'UTF-8')."'";
    }

    sentence_annotation($count,$filter);

    print "<p align=center>";
    foreach (array(5,10,20,50,100) as $more) {
      print "<A HREF=\"javascript:show('bleu','$sort',$more+$count,'$filter_arg')\">$more more</A> | ";
    }
    print "<A HREF=\"javascript:show('bleu','$sort',9999,'$filter_arg')\">all</A> ";
  }
  // annotated sentences core: reads data, sorts sentences, displays them
  //
  // $count  - number of sentences to display
  // $filter - if not "", only sentences whose input contains this word
  //           are considered
  //
  // Reads $_GET['sort'] (order|best|25|avg|75|worst, default "order") and
  // stores the effective value in the global $sort; also sets the global
  // $biconcor from get_biconcor_version().
  function sentence_annotation($count,$filter) {
    global $set,$id,$dir,$biconcor;
    global $sort;

    # get input
    $input = array();
    $filtered = array(); // input line index => 1 for sentences without the filter word
    $file = get_current_analysis_filename("coverage","input-annotation");
    if (file_exists($file)) {
      $input = sentence_annotation_lines($file);
      # filter is so specified
      if ($filter != "") {
        foreach ($input as $i => $input_line) {
          $item = explode("\t",$input_line);
          $word = explode(" ",rtrim($item[0],"\r\n"));
          if (!in_array($filter,$word,true)) { $filtered[$i] = 1; }
        }
      }
    }

    # load bleu scores
    # each line: bleu TAB sentence id TAB system output [TAB reference]...
    $bleu = array();
    foreach (sentence_annotation_lines(get_current_analysis_filename("basic","bleu-annotation")) as $data_line) {
      $item = explode("\t",$data_line);
      if (count($item) < 3) { continue; } // skip malformed lines
      if (array_key_exists($item[1],$filtered)) { continue; }
      $bleu[] = array(
        "bleu" => $item[0],
        "id" => $item[1],
        "system" => $item[2],
        "reference" => implode("<br>",array_slice($item,3)));
    }

    # sort and label additional sentences as filtered
    $sort = array_key_exists("sort",$_GET) ? $_GET['sort'] : "";
    if (!in_array($sort,array("order","best","25","avg","75","worst"),true)) {
      $sort = "order";
    }
    usort($bleu,'sentence_annotation_cmp');

    // quartile / median views start part-way into the sorted list
    $offset = 0;
    if ($sort == "25" || $sort == "75") {
      $offset = (int) (count($bleu)/4);
    }
    else if ($sort == "avg") {
      $offset = (int) (count($bleu)/2);
    }
    $retained = array(); // ids of the sentences that will be displayed
    for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
      $retained[$bleu[$i]["id"]] = 1;
    }

    # get segmentation (phrase alignment)
    # each line: space separated in_start:in_end:out_start:out_end items
    $segmentation = array();
    $file = get_current_analysis_filename("basic","segmentation-annotation");
    if (file_exists($file)) {
      foreach (sentence_annotation_lines($file) as $i => $data_line) {
        if ($filter == "" || array_key_exists($i,$retained)) {
          $segment = 0;
          foreach (explode(" ",rtrim($data_line)) as $item) {
            $part = explode(":",$item);
            if (count($part) < 4) { continue; } // skip malformed items
            $segment++;
            $segmentation[$i]["input_start"][(int) $part[0]] = $segment;
            $segmentation[$i]["input_end"][(int) $part[1]] = $segment;
            $segmentation[$i]["output_start"][(int) $part[2]] = $segment;
            $segmentation[$i]["output_end"][(int) $part[3]] = $segment;
          }
        }
      }
    }

    # get hierarchical data
    # tree file lines: sentence TAB brackets TAB non-terminal TAB words
    $hierarchical = 0;
    $file = get_current_analysis_filename("basic","input-tree");
    if (file_exists($file)) {
      $span = 0;
      $last_sentence = -1;
      foreach (sentence_annotation_lines($file) as $data_line) {
        list($sentence,$brackets,$nt,$words) = array_pad(explode("\t",$data_line),4,"");
        if ($sentence != $last_sentence) { $span = 0; }
        $last_sentence = $sentence;
        if ($filter == "" || array_key_exists($sentence,$retained)) {
          // the non-terminal label is not kept for input spans
          $segmentation[$sentence][$span]["brackets"] = $brackets;
          $segmentation[$sentence][$span]["words"] = rtrim($words);
          $span++;
        }
      }
      $hierarchical = 1;
    }

    $segmentation_out = array();
    $file = get_current_analysis_filename("basic","output-tree");
    if (file_exists($file)) {
      $span = 0;
      $last_sentence = -1;
      $nt_count = array(); // set of distinct non-terminal labels
      foreach (sentence_annotation_lines($file) as $data_line) {
        list($sentence,$brackets,$nt,$words) = array_pad(explode("\t",$data_line),4,"");
        if ($sentence != $last_sentence) { $span = 0; }
        $last_sentence = $sentence;
        if ($filter == "" || array_key_exists($sentence,$retained)) {
          $segmentation_out[$sentence][$span]["brackets"] = $brackets;
          $segmentation_out[$sentence][$span]["nt"] = $nt;
          $segmentation_out[$sentence][$span]["words"] = rtrim($words);
          if ($nt != "") { $nt_count[$nt]=1; }
          $span++;
        }
      }
      # no non-terminal markup, if there are two or less non-terminals (X,S)
      if (count($nt_count) <= 2) {
        foreach ($segmentation_out as $sentence => $segmentation_span) {
          foreach ($segmentation_span as $span => $type) {
            $segmentation_out[$sentence][$span]["nt"]="";
          }
        }
      }
    }

    # node file lines (space separated):
    # sentence depth start_div end_div start_div_in end_div_in children
    $node = array();
    $file = get_current_analysis_filename("basic","node");
    if (file_exists($file)) {
      $n = 0;
      $last_sentence = -1;
      foreach (sentence_annotation_lines($file) as $data_line) {
        list($sentence,$depth,$start_div,$end_div,$start_div_in,$end_div_in,$children) = array_pad(explode(" ",$data_line),7,"");
        if ($sentence != $last_sentence) { $n = 0; }
        $last_sentence = $sentence;
        if ($filter == "" || array_key_exists($sentence,$retained)) {
          $node[$sentence][$n]['depth'] = $depth;
          $node[$sentence][$n]['start_div'] = $start_div;
          $node[$sentence][$n]['end_div'] = $end_div;
          $node[$sentence][$n]['start_div_in'] = $start_div_in;
          $node[$sentence][$n]['end_div_in'] = $end_div_in;
          $node[$sentence][$n]['children'] = rtrim($children);
          $n++;
        }
      }
    }

    # display
    if ($filter != "") {
      print " (".(count($input)-count($filtered))." retaining)";
    }
    print "</font><BR>\n";
    $biconcor = get_biconcor_version($dir,$set,$id);
    for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
      $line = $bleu[$i];
      $sid = $line["id"];
      // sentences without annotation data get empty arrays / strings
      $seg_in = isset($segmentation[$sid]) ? $segmentation[$sid] : array();
      $seg_out = isset($segmentation_out[$sid]) ? $segmentation_out[$sid] : array();
      $sentence_node = isset($node[$sid]) ? $node[$sid] : array();
      if ($hierarchical) {
        annotation_hierarchical($sid,$seg_in,$seg_out,$sentence_node);
      }
      if ($input) {
        print "<div id=\"info-".$sid."\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">0 occ. in corpus, 0 translations, entropy: 0.00</div>\n";
        if ($biconcor) {
          print "<div id=\"biconcor-".$sid."\" class=\"biconcor\"><font size=-2>(click on input phrase for bilingual concordancer)</font></div>";
        }
        $input_line = isset($input[$sid]) ? $input[$sid] : "";
        if ($hierarchical) {
          sentence_annotation_hierarchical("#".$sid,$sid,$input_line,$seg_in,"in");
        }
        else {
          print "<font size=-2>[#".$sid."]</font> ";
          input_annotation($sid,$input_line,$seg_in,$filter);
        }
      }
      if ($hierarchical) {
        sentence_annotation_hierarchical($line["bleu"],$sid,$line["system"],$seg_out,"out");
      }
      else {
        print "<font size=-2>[".$line["bleu"]."]</font> ";
        output_annotation($sid,$line["system"],$seg_in);
      }
      print "<br><font size=-2>[ref]</font> ".$line["reference"]."<hr>";
    }
  }

  // helper of sentence_annotation(): lines of a file (line endings kept),
  // or an empty array if the file is missing or unreadable
  function sentence_annotation_lines($file) {
    $data = file_exists($file) ? file($file) : false;
    return $data === false ? array() : $data;
  }

  // helper of sentence_annotation(): usort() callback, ordering two sentence
  // records according to the global $sort:
  //   worst, 75     - ascending BLEU, ties by descending sentence id
  //   best, avg, 25 - descending BLEU, ties by ascending sentence id
  //   anything else - ascending sentence id
  // Declared at top level (not inside sentence_annotation) so that calling
  // sentence_annotation() more than once cannot redeclare it.
  function sentence_annotation_cmp($a, $b) {
    global $sort;
    if ($sort == "worst" || $sort == "75") {
      $a_idx = (float) $a["bleu"];
      $b_idx = (float) $b["bleu"];
      if ($a_idx == $b_idx) {
        $a_idx = (int) $b["id"];
        $b_idx = (int) $a["id"];
      }
    }
    else if ($sort == "best" || $sort == "avg" || $sort == "25") {
      $a_idx = -(float) $a["bleu"];
      $b_idx = -(float) $b["bleu"];
      if ($a_idx == $b_idx) {
        $a_idx = (int) $a["id"];
        $b_idx = (int) $b["id"];
      }
    }
    else {
      $a_idx = (int) $a["id"];
      $b_idx = (int) $b["id"];
    }
    if ($a_idx == $b_idx) {
      return 0;
    }
    return ($a_idx < $b_idx) ? -1 : 1;
  }
  // Parses a coverage vector: space separated items of the form
  // from-to:corpus_count:ttable_count:ttable_entropy (fields are split on
  // '-' and ':'; trailing fields may be missing).
  //
  // Returns $coverage[from][to] with the keys "corpus_count",
  // "ttable_count" and "ttable_entropy" for every field that is present.
  // Values are returned as the strings found in the input. Items without
  // any count field produce no entry.
  function coverage($coverage_vector) {
    # get information from line in input annotation file
    $coverage = array();
    foreach (explode(" ",$coverage_vector) as $item) {
      if (preg_match("/[\-:]/",$item)) {
        $field = preg_split("/[\-:]/",$item);
        $from = $field[0];
        $to = $field[1];
        if (count($field)>2){ $coverage[$from][$to]["corpus_count"]=$field[2]; }
        if (count($field)>3){ $coverage[$from][$to]["ttable_count"]=$field[3]; }
        if (count($field)>4){
          $coverage[$from][$to]["ttable_entropy"]=$field[4];
          // the historical, misspelled key is kept as an alias for callers
          // that may still read it
          $coverage[$from][$to]["ttabel_entropy"]=$field[4];
        }
      }
    }
    return $coverage;
  }
  // annotate an input sentence
  //
  // Prints the words of one input sentence as inline HTML tables. Above the
  // words, every multi-word phrase (up to 7 words) that has coverage
  // information is drawn as a coloured bar; bars that overlap are stacked
  // on different levels.
  //
  // $sentence     - sentence id, used in element ids and javascript calls
  // $input        - line of the input annotation file:
  //                 words TAB coverage vector, where the vector consists of
  //                 space separated from-to:corpus_count:ttable_count:ttable_entropy
  // $segmentation - phrase segmentation with "input_start" / "input_end"
  //                 maps (word position => segment number), or empty
  // $filter       - word to highlight in red ("" for none)
  function input_annotation($sentence,$input,$segmentation,$filter) {
    global $biconcor;
    list($words,$coverage_vector) = array_pad(explode("\t",rtrim((string) $input,"\r\n")),2,"");

    # get information from line in input annotation file
    $coverage = array();
    foreach (explode(" ",$coverage_vector) as $item) {
      if (preg_match("/[\-:]/",$item)) {
        // missing counts default to 0 so the generated javascript stays valid
        list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = array_pad(preg_split("/[\-:]/",$item),5,0);
        $coverage[$from][$to]["corpus_count"] = $corpus_count;
        $coverage[$from][$to]["ttable_count"] = $ttable_count;
        $coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
      }
    }
    $word = explode(" ",$words);
    $word_count = count($word);

    # compute the display level for each input phrase
    // $box[level][position] = end position of the phrase drawn there
    // $separable[position] = 1 if no phrase spans the gap before position
    $box = array_fill(0,$word_count,array());
    $separable = array_fill(0,$word_count,1);
    $max_level = 0;
    for($length=1;$length<=7;$length++) {
      for($from=0;$from<$word_count-($length-1);$from++) {
        $to = $from + ($length-1);
        if (array_key_exists($from,$coverage) &&
            array_key_exists($to,$coverage[$from]) &&
            array_key_exists("corpus_count",$coverage[$from][$to])) {
          // find the lowest level on which positions from..to are all free
          $level=0;
          $available = 0;
          while(!$available) {
            $available = 1;
            $level++;
            for($j=$from;$j<=$to;$j++) {
              if (array_key_exists($level,$box) &&
                  array_key_exists($j,$box[$level])) {
                $available = 0;
              }
            }
          }
          for($j=$from;$j<=$to;$j++) {
            $box[$level][$j] = $to;
          }
          $max_level = max($max_level,$level);
          for($j=$from+1;$j<=$to;$j++) {
            $separable[$j] = 0;
          }
        }
      }
    }
    $separable[$word_count] = 1;

    # display input phrases
    $sep_start = 0;
    for($sep_end=1;$sep_end<=$word_count;$sep_end++) {
      if ($separable[$sep_end] == 1) {
        # one table for each separable block
        print "<table cellpadding=1 cellspacing=0 border=0 style=\"display: inline;\">";
        for($level=$max_level;$level>=1;$level--) {
          # rows for phrase display
          print "<tr style=\"height:5px;\">";
          for($from=$sep_start;$from<$sep_end;$from++) {
            if (array_key_exists($from,$box[$level])) {
              $to = $box[$level][$from];
              $size = $to - $from + 1;
              if ($size == 1) {
                // single word: invisible copy of the word, keeps the column width
                print "<td><div style=\"height:0px; opacity:0; position:relative; z-index:-9;\">".$word[$from];
              }
              else {
                $color = coverage_color($coverage[$from][$to]);
                $phrase = "";
                $highlightwords = "";
                $lowlightwords = "";
                for($j=$from;$j<=$to;$j++) {
                  if ($j>$from) { $phrase .= " "; }
                  $phrase .= $word[$j];
                  // the word spans printed below carry the id
                  // inputword-<sentence>-<position>
                  $word_color = isset($coverage[$j][$j]) ? coverage_color($coverage[$j][$j]) : '#ffffff';
                  $highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
                  $lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".$word_color."';";
                }
                print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">";
              }
              print "</div></td>";
              $from += $size-1;
            }
            else {
              // NOTE(review): $to still holds the value left by an earlier
              // iteration here; kept as in the original - confirm intent
              print "<td><div style=\"height:".($from==$to ? 0 : 3)."px;\"></div></td>";
            }
          }
          print "</tr>\n";
        }
        # display input words
        print "<tr><td colspan=".($sep_end-$sep_start)."><div style=\"position:relative; z-index:1;\">";
        for($j=$sep_start;$j<$sep_end;$j++) {
          if (isset($segmentation["input_start"][$j])) {
            $segment_id = $segmentation["input_start"][$j];
            print "<span id=\"input-$sentence-$segment_id\" style=\"border-color:#000000; border-style:solid; border-width:1px;\" onmouseover=\"highlight_phrase($sentence,$segment_id);\" onmouseout=\"lowlight_phrase($sentence,$segment_id);\">";
          }
          if (isset($coverage[$j][$j])) {
            $color = coverage_color($coverage[$j][$j]);
            $cc = $coverage[$j][$j]["corpus_count"];
            $tc = $coverage[$j][$j]["ttable_count"];
            $te = $coverage[$j][$j]["ttable_entropy"];
          }
          else { # unknown words
            $color = '#ffffff';
            $cc = 0; $tc = 0; $te = 0;
          }
          print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($word[$j])."');\"":"").">";
          if ($word[$j] == $filter) {
            print "<b><font color=#ff0000>".$word[$j]."</font></b>";
          }
          else {
            print $word[$j];
          }
          print "</span>";
          if (isset($segmentation["input_end"][$j])) {
            print "</span>";
          }
          print " ";
        }
        print "</div></td></tr>\n";
        print "</table>\n";
        $sep_start = $sep_end;
      }
    }
    print "<br>";
  }
  // color-coded coverage stats (corpus count, ttable count, entropy)
  //
  // $phrase - array with the keys "corpus_count", "ttable_count" and
  //           "ttable_entropy"; missing keys (or a null $phrase) count as 0
  //
  // Returns a CSS colour "#rrggbb" with red = entropy channel,
  // green = corpus count channel, blue = ttable count channel (pale green
  // towards red). Each channel starts at ff for a value of 0 and gets
  // darker as the value grows, but never drops below 80.
  function coverage_color($phrase) {
    $corpus_count = isset($phrase["corpus_count"]) ? max(0.0,(float) $phrase["corpus_count"]) : 0.0;
    $ttable_count = isset($phrase["ttable_count"]) ? max(0.0,(float) $phrase["ttable_count"]) : 0.0;
    $ttable_entropy = isset($phrase["ttable_entropy"]) ? (float) $phrase["ttable_entropy"] : 0.0;

    $channel = array(
      255 - 32 * $ttable_entropy,          // red
      255 - 10 * log(1 + $corpus_count),   // green
      255 - 20 * log(1 + $ttable_count));  // blue
    $color = "#";
    foreach ($channel as $value) {
      // clamp to 128..255 and convert explicitly to int: passing a float
      // to dechex() or % is deprecated, and values above 255 would not
      // fit into two hex digits
      $value = (int) max(128,min(255,$value));
      $color .= sprintf("%02x",$value);
    }
    return $color;
  }
  // annotate an output sentence
  //
  // Prints the words of a system translation; every word gets a background
  // colour selected by its match flag.
  //
  // $sentence     - sentence id, used in element ids and javascript calls
  // $system       - space separated tokens of the form surface|N, where N
  //                 (0..4) selects the colour; a token without flag is
  //                 treated as N=0, N outside 0..4 is clamped
  // $segmentation - phrase segmentation with "output_start" / "output_end"
  //                 maps (word position => segment number), or empty
  function output_annotation($sentence,$system,$segmentation) {
    $color = array("#c0c0c0","#e0e0ff","#b0b0ff","#8080ff","#4040ff");
    // a trailing line break would otherwise end up in the last flag
    $word = explode(" ",rtrim((string) $system,"\r\n"));
    for($j=0;$j<count($word);$j++) {
      // split at the last '|' so that a surface form containing '|' survives
      $pipe = strrpos($word[$j],"|");
      if ($pipe === false) {
        $surface = $word[$j];
        $correct = 0;
      }
      else {
        $surface = substr($word[$j],0,$pipe);
        $correct = (int) substr($word[$j],$pipe+1);
      }
      $correct = max(0,min(count($color)-1,$correct));

      if (isset($segmentation["output_start"][$j])) {
        $segment_id = $segmentation["output_start"][$j];
        print "<span id=\"output-$sentence-$segment_id\" style=\"border-color:#000000; border-style:solid; border-width:1px;\" onmouseover=\"highlight_phrase($sentence,$segment_id);\" onmouseout=\"lowlight_phrase($sentence,$segment_id);\">";
      }
      print "<span style=\"background-color: $color[$correct]\">$surface</span>";
      if (isset($segmentation["output_end"][$j])) {
        print "</span>";
      }
      print " ";
    }
  }
  1242. function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node) {
  1243. print "<script language=\"javascript\">\n";
  1244. print "max_depth[$sentence] = ".strlen($segmentation[0]["brackets"]).";\n";
  1245. print "span_count_out[$sentence] = ".count($segmentation_out).";\n";
  1246. print "span_count_in[$sentence] = ".count($segmentation).";\n";
  1247. print "nodeIn[$sentence] = [];\n";
  1248. print "nodeOut[$sentence] = [];\n";
  1249. print "nodeChildren[$sentence] = [];\n";
  1250. for($n=0;$n<count($node);$n++) {
  1251. print "nodeIn[$sentence].push({ start: ".$node[$n]['start_div_in'].", end: ".$node[$n]['…

Large files are truncated, but you can click here to view the full file