PageRenderTime 56ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/wp-content/plugins/transposh-translation-filter-for-wordpress/core/parser.php

https://bitbucket.org/lgorence/quickpress
PHP | 948 lines | 544 code | 88 blank | 316 comment | 232 complexity | 32a23d6032cc9dba76bf9bf436cf784c MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, AGPL-1.0
  1. <?php
  2. /*
  3. * Transposh v0.8.3
  4. * http://transposh.org/
  5. *
  6. * Copyright 2012, Team Transposh
  7. * Licensed under the GPL Version 2 or higher.
  8. * http://transposh.org/license
  9. *
  10. * Date: Mon, 28 May 2012 14:38:35 +0300
  11. */
  12. require_once("shd/simple_html_dom.php");
  13. require_once("constants.php");
  14. require_once("utils.php");
  15. define('PUNCT_BREAKS', TRUE); // Whether punctuation such as , . ( and similar characters breaks a phrase
  16. define('NUM_BREAKS', TRUE); // Whether a number breaks a phrase
  17. define('ENT_BREAKS', TRUE); // Whether an HTML entity breaks a phrase
  18. /**
  19. * parserstats class - holds parser statistics
  20. */
  class parserstats {

      // All counters start at zero so that the calculated getters below
      // always operate on integers, even before the first phrase is seen.

      /** @var int Holds the total phrases the parser encountered */
      public $total_phrases = 0;

      /** @var int Holds the number of phrases that had translation */
      public $translated_phrases = 0;

      /** @var int Holds the number of phrases that had human translation */
      public $human_translated_phrases = 0;

      /** @var int Holds the number of phrases that are hidden - yet still somewhat viewable (such as the title attribute) */
      public $hidden_phrases = 0;

      /** @var int Holds the number of phrases that are hidden and translated */
      public $hidden_translated_phrases = 0;

      /** @var int Holds the amounts of hidden spans created for translation */
      public $hidden_translateable_phrases = 0;

      /** @var int Holds the number of phrases that are hidden and probably won't be viewed - such as meta keys */
      public $meta_phrases = 0;

      /** @var int Holds the number of translated phrases that are hidden and probably won't be viewed - such as meta keys */
      public $meta_translated_phrases = 0;

      /** @var string Holds the time translation took, in seconds, as formatted by number_format() with three decimals */
      public $time;

      /** @var float Holds the time translation started, as returned by microtime(true) */
      private $start_time;

      /**
       * Starts the clock as soon as the object is created.
       *
       * Declared as __construct() because a method named after its class is
       * no longer treated as a constructor from PHP 8.0 on; with the old
       * name the start time would stay unset and stop_timing() would report
       * the raw Unix timestamp as the duration.
       */
      function __construct() {
          $this->start_time = microtime(true);
      }

      /**
       * Calculated values - computer translated phrases
       * @return int How many phrases were auto-translated (translated minus human translated)
       */
      function get_computer_translated_phrases() {
          return $this->translated_phrases - $this->human_translated_phrases;
      }

      /**
       * Calculated values - missing phrases
       * @return int How many phrases are missing (total minus translated)
       */
      function get_missing_phrases() {
          return $this->total_phrases - $this->translated_phrases;
      }

      /**
       * Start (or restart) the timer
       */
      function start_timing() {
          $this->start_time = microtime(true);
      }

      /**
       * Stop timing, store the elapsed time in $time for reference
       */
      function stop_timing() {
          $this->time = number_format(microtime(true) - $this->start_time, 3);
      }

  }
  75. /**
  76. * Parser class - allows phrase marking and translation with callback functions
  77. */
  78. class parser {
  79. // functions that need to be defined... //
  80. public $url_rewrite_func = null;
  81. public $fetch_translate_func = null;
  82. public $prefetch_translate_func = null;
  83. public $split_url_func = null;
  84. /** @var int stores the number of the last used span_id */
  85. private $span_id = 0;
  86. /** @var simple_html_dom_node Contains the current node */
  87. private $currentnode;
  88. /** @var simple_html_dom Contains the document dom model */
  89. private $html;
  90. // the document
  91. public $dir_rtl;
  92. /** @var string Contains the iso of the target language */
  93. public $lang;
  94. /** @var boolean Contains the fact that this language is the default one (only parse other language spans) */
  95. public $default_lang = false;
  96. /** @var string Contains the iso of the source language - if a lang attribute is found, assumed to be en by default */
  97. public $srclang;
  98. private $inbody = false;
  99. /** @var boolean Holds the fact that we are in a select or another similar element */
  100. private $inselect = false;
  101. public $is_edit_mode;
  102. public $is_auto_translate;
  103. public $feed_fix;
  104. /** @var boolean should we attempt to handle page as json */
  105. public $might_json = false;
  106. public $allow_ad = false;
  107. //first three are html, later 3 come from feeds xml (link is problematic...)
  108. protected $ignore_tags = array('script' => 1, 'style' => 1, 'code' => 1, 'wfw:commentrss' => 1, 'comments' => 1, 'guid' => 1);
  109. /** @var parserstats Contains parsing statistics */
  110. private $stats;
  111. /** @var boolean Are we inside a translated gettext */
  112. private $in_get_text = false;
  113. /** @var boolean Are we inside an inner text %s in gettext */
  114. private $in_get_text_inner = false;
  115. /** @var string Additional header information */
  116. public $added_header;
  117. /** @var array Contains references to changeable a tags */
  118. private $atags = array();
  119. /** @var array Contains references to changeable option values */
  120. private $otags = array();
  121. private $edit_span_created = false;
  122. /**
  123. * Determine if the current position in buffer is a white space.
  124. * @param char $char
  125. * @return boolean true if current position marks a white space
  126. */
  function is_white_space($char) {
      // A missing character (null, false or the empty string - which is what
      // callers get when they look one position past the end of the buffer)
      // counts as white space. The test is explicit rather than "!$char"
      // because the string '0' is falsy in PHP and the digit zero must not be
      // reported as white space.
      if ($char === null || $char === false || $char === '') {
          return true;
      }
      // same character set that trim() strips by default
      return strspn($char, " \t\r\n\0\x0B") > 0;
  }
  131. /**
  132. * Determine if the current position in page points to a character in the
  133. * range of a-z (case insensitive).
  134. * @return boolean true if a-z
  135. */
  function is_a_to_z_character($char) {
      // lower case latin letter?
      if ($char >= 'a' && $char <= 'z') {
          return true;
      }
      // otherwise it has to be an upper case one
      if ($char >= 'A' && $char <= 'Z') {
          return true;
      }
      return false;
  }
  139. /**
  140. * Determine if the current position is a digit.
  141. * @return boolean true if a digit
  142. */
  function is_digit($char) {
      // anything below '0' can't be a digit
      if (!($char >= '0')) {
          return false;
      }
      // and neither can anything above '9'
      return ($char <= '9');
  }
  146. /**
  147. * Determine if the current position is an html entity - such as &amp; or &#8220;.
  148. * @param string $string string to evaluate
  149. * @param int $position where to check for entities
  150. * @return int length of entity
  151. */
  function is_html_entity($string, $position) {
      // Returns the byte length of the entity (including the leading '&' and
      // the trailing ';') that starts at $position, or 0 when there is none.
      // Every offset is checked against the buffer length first, so text that
      // ends in an unterminated "&..." no longer reads past the end.
      $length = strlen($string);
      if ($position < 0 || $position >= $length || $string[$position] != '&') {
          return 0;
      }
      // an entity body is made of '#', digits and a-z letters (either case)
      $body_len = strspn($string, '#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $position + 1);
      $end_pos = $position + 1 + $body_len;
      // and it only counts when terminated by a semicolon
      if ($end_pos < $length && $string[$end_pos] == ';') {
          return $end_pos - $position + 1;
      }
      return 0;
  }
  161. /**
  162. * Some entities will not cause a break if they don't have whitespace after them
  163. * such as Jack`s apple.
  164. * `uncatagorized` will break on the later entity
  165. * Added " quotes to this claim, as it is used in some languages in a similar fashion
  166. * @param string $entity - html entity to check
  167. * @return boolean true if the entity breaks a phrase, false for apostrophe/quote-like entities
  168. */
  function is_entity_breaker($entity) {
      // apostrophe and quote like entities which do not break a phrase on their own
      $non_breakers = '&#8217;&apos;&quot;&#039;&#39;&rsquo;&lsquo;&rdquo;&ldquo;';
      // everything that is not in that list is a breaker
      if (stripos($non_breakers, $entity) === FALSE) {
          return true;
      }
      return false;
  }
  172. /**
  173. * Some entities are to be regarded as simple letters in most cases
  174. &Agrave; &#192; À À latin capital letter A with grave
  175. &Aacute; &#193; Á Á latin capital letter A with acute
  176. &Acirc; &#194; Â Â latin capital letter A with circumflex
  177. &Atilde; &#195; Ã Ã latin capital letter A with tilde
  178. &Auml; &#196; Ä Ä latin capital letter A with diaeresis
  179. &Aring; &#197; Å Å latin capital letter A with ring above
  180. &AElig; &#198; Æ Æ latin capital letter AE
  181. &Ccedil; &#199; Ç Ç latin capital letter C with cedilla
  182. &Egrave; &#200; È È latin capital letter E with grave
  183. &Eacute; &#201; É É latin capital letter E with acute
  184. &Ecirc; &#202; Ê Ê latin capital letter E with circumflex
  185. &Euml; &#203; Ë Ë latin capital letter E with diaeresis
  186. &Igrave; &#204; Ì Ì latin capital letter I with grave
  187. &Iacute; &#205; Í Í latin capital letter I with acute
  188. &Icirc; &#206; Î Î latin capital letter I with circumflex
  189. &Iuml; &#207; Ï Ï latin capital letter I with diaeresis
  190. &ETH; &#208; Ð Ð latin capital letter ETH
  191. &Ntilde; &#209; Ñ Ñ latin capital letter N with tilde
  192. &Ograve; &#210; Ò Ò latin capital letter O with grave
  193. &Oacute; &#211; Ó Ó latin capital letter O with acute
  194. &Ocirc; &#212; Ô Ô latin capital letter O with circumflex
  195. &Otilde; &#213; Õ Õ latin capital letter O with tilde
  196. &Ouml; &#214; Ö Ö latin capital letter O with diaeresis
  197. //&times; &#215; × × multiplication sign
  198. &Oslash; &#216; Ø Ø latin capital letter O with stroke
  199. &Ugrave; &#217; Ù Ù latin capital letter U with grave
  200. &Uacute; &#218; Ú Ú latin capital letter U with acute
  201. &Ucirc; &#219; Û Û latin capital letter U with circumflex
  202. &Uuml; &#220; Ü Ü latin capital letter U with diaeresis
  203. &Yacute; &#221; Ý Ý latin capital letter Y with acute
  204. &THORN; &#222; Þ Þ latin capital letter THORN
  205. &szlig; &#223; ß ß latin small letter sharp s
  206. &agrave; &#224; à à latin small letter a with grave
  207. &aacute; &#225; á á latin small letter a with acute
  208. &acirc; &#226; â â latin small letter a with circumflex
  209. &atilde; &#227; ã ã latin small letter a with tilde
  210. &auml; &#228; ä ä latin small letter a with diaeresis
  211. &aring; &#229; å å latin small letter a with ring above
  212. &aelig; &#230; æ æ latin small letter ae
  213. &ccedil; &#231; ç ç latin small letter c with cedilla
  214. &egrave; &#232; è è latin small letter e with grave
  215. &eacute; &#233; é é latin small letter e with acute
  216. &ecirc; &#234; ê ê latin small letter e with circumflex
  217. &euml; &#235; ë ë latin small letter e with diaeresis
  218. &igrave; &#236; ì ì latin small letter i with grave
  219. &iacute; &#237; í í latin small letter i with acute
  220. &icirc; &#238; î î latin small letter i with circumflex
  221. &iuml; &#239; ï ï latin small letter i with diaeresis
  222. &eth; &#240; ð ð latin small letter eth
  223. &ntilde; &#241; ñ ñ latin small letter n with tilde
  224. &ograve; &#242; ò ò latin small letter o with grave
  225. &oacute; &#243; ó ó latin small letter o with acute
  226. &ocirc; &#244; ô ô latin small letter o with circumflex
  227. &otilde; &#245; õ õ latin small letter o with tilde
  228. &ouml; &#246; ö ö latin small letter o with diaeresis
  229. //&divide; &#247; ÷ ÷ division sign
  230. &oslash; &#248; ø ø latin small letter o with stroke
  231. &ugrave; &#249; ù ù latin small letter u with grave
  232. &uacute; &#250; ú ú latin small letter u with acute
  233. &ucirc; &#251; û û latin small letter u with circumflex
  234. &uuml; &#252; ü ü latin small letter u with diaeresis
  235. &yacute; &#253; ý ý latin small letter y with acute
  236. &thorn; &#254; þ þ latin small letter thorn
  237. &yuml; &#255; ÿ ÿ latin small letter y with diaeresis
  238. Latin-1 extended
  239. &OElig; &#338; latin capital ligature OE
  240. &oelig; &#339; latin small ligature oe
  241. &Scaron; &#352; latin capital letter S with caron
  242. &scaron; &#353; latin small letter s with caron
  243. &Yuml; &#376; latin capital letter Y with diaeresis
  244. */
  function is_entity_letter($entity) {
      // Returns true when the entity stands for an accented latin letter and
      // should therefore be treated as a plain letter inside a word.
      // Numeric references are accepted in decimal (&#233;) as well as in
      // hexadecimal (&#xE9;) form - both notations name the same code point.
      if (strncmp($entity, '&#', 2) === 0) {
          $reference = rtrim(substr($entity, 2), ';');
          if ($reference !== '' && ($reference[0] == 'x' || $reference[0] == 'X')) {
              $hex = substr($reference, 1);
              // ctype_xdigit keeps malformed input away from hexdec()
              $entnum = ctype_xdigit($hex) ? hexdec($hex) : 0;
          } else {
              $entnum = (int) $reference;
          }
          // Latin-1 letters (without the multiplication and division signs
          // at 215 and 247) and the Latin extended OE, S-caron and Y-diaeresis
          if (($entnum >= 192 && $entnum <= 214) || ($entnum >= 216 && $entnum <= 246) || ($entnum >= 248 && $entnum <= 255)
                  || $entnum == 338 || $entnum == 339 || $entnum == 352 || $entnum == 353 || $entnum == 376) {
              return true;
          }
      }
      // named entities - the lookup is case insensitive, so the lower case
      // names are matched by their upper case counterparts in this list
      $entities = '&Agrave;&Aacute;&Acirc;&Atilde;&Auml;&Aring;&AElig;&Ccedil;&Egrave;&Eacute;&Ecirc;&Euml;&Igrave;&Iacute;&Icirc;&Iuml;&ETH;' .
              '&Ntilde;&Ograve;&Oacute;&Ocirc;&Otilde;&Ouml;&Oslash;&Ugrave;&Uacute;&Ucirc;&Uuml;&Yacute;&THORN;&szlig;' .
              '&oslash;&ugrave;&yuml;&oelig;&scaron;';
      return (stripos($entities, $entity) !== FALSE);
  }
  256. /**
  257. * Determine if the current position in buffer is a sentence breaker, e.g. '.' or ',' .
  258. * Note html markups are not considered sentence breaker within the scope of this function.
  259. * @param char $char character checked if breaker
  260. * @param char $nextchar needed for checking if . or - breaks
  261. * @return int length of breaker if current position marks a break in sentence
  262. */
  function is_sentence_breaker($char, $nextchar, $nextnextchar) {
      // Returns the length in bytes of the breaker that starts at $char, or 0
      // when $char does not start one. $nextchar and $nextnextchar are the two
      // following bytes and are needed for '.', '-' and the multi byte breakers.

      // No character at all (past the end of the buffer) is never a breaker.
      // Without this guard strpos() below matches an empty needle at offset 0
      // on PHP 8 and would report a breaker of length 1.
      if ($char === null || $char === false || $char === '') {
          return 0;
      }
      // a dot or a dash only breaks when followed by white space (or end of text)
      if (($char == '.' || $char == '-') && ($this->is_white_space($nextchar))) {
          return 1;
      }
      // the string casts keep ord() away from null values
      $byte1 = ord((string) $char);
      $byte2 = ord((string) $nextchar);
      $byte3 = ord((string) $nextnextchar);
      // U+FF0C fullwidth comma (UTF-8: EF BC 8C)
      if ($byte1 == 239 && $byte2 == 188 && $byte3 == 140) {
          return 3;
      }
      // U+2219 bullet operator (UTF-8: E2 88 99)
      if ($byte1 == 226 && $byte2 == 136 && $byte3 == 153) {
          return 3;
      }
      // U+00B7 middle dot (UTF-8: C2 B7)
      if ($byte1 == 194 && $byte2 == 183) {
          return 2;
      }
      // single byte punctuation and the gettext marker characters
      return (strpos(',?()[]{}"!:|;' . TP_GTXT_BRK . TP_GTXT_BRK_CLOSER . TP_GTXT_IBRK . TP_GTXT_IBRK_CLOSER, $char) !== false) ? 1 : 0; // TODO: might need to add < and > here
  }
  276. /**
  277. * Determines if the current position marks the begining of a number, e.g. 123 050-391212232
  278. * @return int length of number.
  279. */
  function is_number($page, $position) {
      // digits together with the signs and separators that commonly appear inside numbers
      $number_chars = '0123456789-+$%#*,.\\/';
      // length of the run of such characters that begins at $position
      $run_length = strspn($page, $number_chars, $position);
      return $run_length;
  }
  283. /**
  284. * Create a phrase tag in the html dom tree
  285. * @param int $start - beginning of phrase in element
  286. * @param int $end - end of phrase in element
  287. */
  function tag_phrase($string, $start, $end) {
      // Appends a 'phrase' node for the trimmed text between $start and $end
      // to the node currently being processed.
      $phrase = trim(substr($string, $start, $end - $start));
      // nothing is tagged while the gettext nesting counter is above the inner one
      if ($this->in_get_text > $this->in_get_text_inner) {
          return;
      }
      // nothing to tag when only white space (or nothing) is left
      if (!$phrase) {
          return;
      }
      $phrase_node = new simple_html_dom_node($this->html);
      $phrase_node->tag = 'phrase';
      $phrase_node->parent = $this->currentnode;
      $this->currentnode->nodes[] = $phrase_node;
      // the phrase node itself outputs nothing
      $phrase_node->_[HDOM_INFO_OUTER] = '';
      // the phrase, and where it sits inside the containing string
      $phrase_node->phrase = $phrase;
      $phrase_node->start = $start;
      $phrase_node->len = strlen($phrase);
      // state of the parser at the time the phrase was found
      if ($this->srclang) {
          $phrase_node->srclang = $this->srclang;
      }
      if ($this->inbody) {
          $phrase_node->inbody = $this->inbody;
      }
      if ($this->inselect) {
          $phrase_node->inselect = true;
      }
  }
  310. /**
  311. * Breaks strings into substring according to some rules and common sense
  312. * @param string $string - the string which is "broken" into smaller strings
  313. */
  function parsetext($string) {
      // Walks over $string byte by byte and calls tag_phrase() for every run
      // of text found between breakers (entities, embedded tags, gettext
      // markers, punctuation and numbers).
      $pos = 0;
      // skip CDATA in feed_fix mode
      if ($this->feed_fix && strpos($string, '<![CDATA[') === 0) {
          $pos = 9; // length of the CDATA opener
          $string = substr($string, 0, -3); // chop the closing ]]>
      }
      $start = $pos;
      // the string does not change inside the loop, so measure it only once
      $length = strlen($string);
      while ($pos < $length) {
          // Some HTML entities make us break, almost all but apostrophes
          if (ENT_BREAKS && $len_of_entity = $this->is_html_entity($string, $pos)) {
              $entity = substr($string, $pos, $len_of_entity);
              $after_entity = $this->char_at($string, $pos + $len_of_entity);
              if (($this->is_white_space($after_entity) || $this->is_entity_breaker($entity)) && !$this->is_entity_letter($entity)) {
                  $this->tag_phrase($string, $start, $pos);
                  $start = $pos + $len_of_entity;
              }
              // skip past entity
              $pos += $len_of_entity;
          }
          // we have a special case for <> tags which might have come to us (maybe in xml feeds) (we'll skip them...)
          elseif ($string[$pos] == '<') {
              $this->tag_phrase($string, $start, $pos);
              // the bounds check comes first so an unclosed tag does not read past the end
              while ($pos < $length && $string[$pos] != '>') {
                  $pos++;
              }
              $pos++;
              $start = $pos;
          }
          // gettext markers - the opener raises the nesting counter, the closer lowers it
          elseif ($string[$pos] == TP_GTXT_BRK || $string[$pos] == TP_GTXT_BRK_CLOSER) {
              $this->tag_phrase($string, $start, $pos);
              if ($string[$pos] == TP_GTXT_BRK) {
                  $this->in_get_text += 1;
              } else {
                  $this->in_get_text -= 1;
              }
              $pos++;
              $start = $pos;
          }
          // inner gettext markers are only counted while inside a gettext
          elseif ($string[$pos] == TP_GTXT_IBRK || $string[$pos] == TP_GTXT_IBRK_CLOSER) {
              $this->tag_phrase($string, $start, $pos);
              if ($this->in_get_text) {
                  if ($string[$pos] == TP_GTXT_IBRK) {
                      $this->in_get_text_inner += 1;
                  } else {
                      $this->in_get_text_inner -= 1;
                  }
              }
              $pos++;
              $start = $pos;
          }
          // will break translation unit when there's a breaker ",.[]()..."
          elseif (PUNCT_BREAKS && $senb_len = $this->is_sentence_breaker($string[$pos], $this->char_at($string, $pos + 1), $this->char_at($string, $pos + 2))) {
              $this->tag_phrase($string, $start, $pos);
              $pos += $senb_len;
              $start = $pos;
          }
          // Numbers also break, if they are followed by whitespace (or a sentence breaker) (don't break 42nd) // TODO: probably by breaking entities too...
          elseif (NUM_BREAKS && $num_len = $this->is_number($string, $pos)) {
              // last character of the number run (always inside the string) and what follows it
              $last = $string[$pos + $num_len - 1];
              $after = $this->char_at($string, $pos + $num_len);
              $after2 = $this->char_at($string, $pos + $num_len + 1);
              $after3 = $this->char_at($string, $pos + $num_len + 2);

              $at_start = ($start == $pos);
              // only look behind when there is something behind us
              $ws_before = !$at_start && $this->is_white_space($string[$pos - 1]);
              // this is the case of B2 or B2, - the run itself ends with a breaker
              $last_breaks = $this->is_sentence_breaker($last, $after, $after2);
              $followed = $this->is_white_space($after) || $this->is_sentence_breaker($after, $after2, $after3);

              // NOTE: the grouping below mirrors the operator precedence of the
              // original condition - a number that opens the current phrase
              // always breaks, whatever follows it
              if ($at_start || (($ws_before || $last_breaks) && $followed)) {
                  // we will now compensate on the number followed by breaker case, if we need to
                  if (!$at_start && !$ws_before) {
                      if ($last_breaks) {
                          $num_len--; // this makes the added number shorter by one, and the pos will be at a sentence breaker next so we don't have to compensate
                      }
                      $pos += $num_len;
                      $num_len = 0; // we have already added this
                  }
                  $this->tag_phrase($string, $start, $pos);
                  $start = $pos + $num_len;
              }
              $pos += $num_len;
          } else {
              // smarter marking of start location
              if ($start == $pos && $this->is_white_space($string[$pos])) {
                  $start++;
              }
              $pos++;
          }
      }
      // the end is also some breaker
      if ($pos > $start) {
          $this->tag_phrase($string, $start, $pos);
      }
  }

  // Internal helper of parsetext(): the byte at $offset of $string, or an
  // empty string when $offset lies outside of it (replaces the @$string[...]
  // look-ahead reads, which yield the same empty string).
  function char_at($string, $offset) {
      return ($offset >= 0 && isset($string[$offset])) ? $string[$offset] : '';
  }
  409. /**
  410. * This recursive function works on the $html dom and adds phrase nodes to translate as needed
  411. * it currently also rewrites urls, and should consider if this is smart
  412. * @param simple_html_dom_node $node
  413. */
  function translate_tagging($node, $level = 0) {
      // Recursively walks the dom from $node down and adds phrase nodes for
      // everything that should be translated. $level is the recursion depth
      // and is only passed along.
      //
      // The source language and the "inside select" flag are saved here and
      // put back on every way out, so a node that is skipped early (ignored
      // tag, default language, only-this-language) can not leak its state
      // into the rest of the document, and nested select-like elements don't
      // clear the flag of the element around them.
      $this->currentnode = $node;
      // we don't want to translate non-translatable classes
      if (stripos($node->class, NO_TRANSLATE_CLASS) !== false || stripos($node->class, NO_TRANSLATE_CLASS_GOOGLE) !== false) {
          return;
      }
      $prevsrclang = $this->srclang;
      $previnselect = $this->inselect;
      // the node lang is the current node lang or its parent lang
      $src_set_here = false;
      if ($node->lang) {
          $this->srclang = strtolower($node->lang);
          $src_set_here = true;
          // eliminate the lang tag from the html, since we aim to translate it
          unset($node->lang);
      }
      // we can only do translation for elements which are in the body, not in other places, and this must
      // happen here due to the possibility of early recurse in default language
      if ($node->tag == 'body') {
          $this->inbody = true;
      }
      // the different behaviour on select, textarea and noscript
      elseif ($node->tag == 'select' || $node->tag == 'textarea' || $node->tag == 'noscript') {
          $this->inselect = true;
      }

      $this->translate_tagging_node($node, $level, $src_set_here);

      // back to the state of our parent
      $this->srclang = $prevsrclang;
      $this->inselect = $previnselect;
  }

  // Internal helper of translate_tagging(): does the actual work for one node
  // after the language/select state has been set up. It may return at any
  // point, the caller restores the state.
  function translate_tagging_node($node, $level, $src_set_here) {
      // support only_thislanguage class, (nulling the node if it should not display)
      if ($src_set_here && $this->srclang != $this->lang && stripos($node->class, ONLY_THISLANGUAGE_CLASS) !== false) {
          $node->outertext = '';
          return;
      }
      // if we are in the default lang, and we have no foreign langs classes, we'll recurse from here
      // we also avoid processing if the node lang is the target lang
      if (($this->default_lang && !$this->srclang) || ($this->srclang === $this->lang)) {
          foreach ($node->nodes as $c) {
              $this->translate_tagging($c, $level + 1);
          }
          return;
      }
      if (isset($this->ignore_tags[$node->tag])) {
          return;
      }
      if ($node->tag == 'text') {
          // this prevents translation of a link that just surrounds its address
          if ($node->parent->tag == 'a' && $node->parent->href == $node->outertext) {
              return;
          }
          // link tags inners are to be ignored
          if ($node->parent->tag == 'link') {
              return;
          }
          if (trim($node->outertext)) {
              $this->parsetext($node->outertext);
          }
      }
      // anchors are collected, their urls are rewritten later
      elseif ($node->tag == 'a') {
          array_push($this->atags, $node);
      }
      // same for options, although normally not required (ticket #34)
      elseif ($node->tag == 'option') {
          array_push($this->otags, $node);
      }
      // in submit type inputs, we want to translate the value
      elseif ($node->tag == 'input' && $node->type == 'submit') {
          $this->parsetext($node->value);
      }
      // for iframes we will rewrite urls if we can
      elseif ($node->tag == 'iframe') {
          if ($this->url_rewrite_func) {
              $node->src = call_user_func_array($this->url_rewrite_func, array($node->src));
          }
      }
      // titles are also good places to translate, exist in a, img, abbr, acronym
      if ($node->title) {
          $this->parsetext($node->title);
      }
      // Meta content (keywords, description) are also good places to translate (but not in robots... or http-equiv)
      if ($node->tag == 'meta' && $node->content && ($node->name != 'robots') && ($node->name != 'viewport') && ($node->{'http-equiv'} != 'Content-Type')) {
          $this->parsetext($node->content);
      }
      // recurse
      foreach ($node->nodes as $c) {
          $this->translate_tagging($c, $level + 1);
      }
  }
  502. /**
  503. * Creates a span used in translation and editing
  504. * @param string $original_text
  505. * @param string $translated_text
  506. * @param int $source (Either "0" for Human, "1" for Machine or "" for untouched)
  507. * @param boolean $for_hidden_element
  508. * @param string $src_lang - if source lang of element is different that default (eg. wrapped in lang="xx" attr)
  509. * @return string
  510. */
  function create_edit_span($original_text, $translated_text, $source, $for_hidden_element = false, $src_lang = '') {
      // Builds the html of one translation span and advances the span id counter.
      // The original text travels base64 encoded in data-token, so it comes back
      // exactly as it was sent without the client decoding/encoding it in any way.
      // NOTE(review): the data-orig, data-trans and data-srclang values are written
      // as they are, without attribute escaping - confirm callers never pass a
      // double quote in them.
      $this->edit_span_created = true;
      $pieces = array(
          '<span class ="' . SPAN_PREFIX . '"',
          ' id="' . SPAN_PREFIX . $this->span_id . '"',
          ' data-token="' . transposh_utils::base64_url_encode($original_text) . '"',
          ' data-source="' . $source . '"'
      );
      // if we have a source language
      if ($src_lang) {
          $pieces[] = ' data-srclang="' . $src_lang . '"';
      }
      // those are needed for on the fly image creation / hidden elements translations
      if ($this->is_edit_mode || $for_hidden_element) {
          $pieces[] = ' data-orig="' . $original_text . '"';
          if ($for_hidden_element) {
              $pieces[] = ' data-hidden="y"';
              // hidden elements currently have issues figuring what they translated in the JS
              if ($translated_text != null) {
                  $pieces[] = ' data-trans="' . $translated_text . '"';
              }
          }
      }
      $pieces[] = '>';
      // visible spans show the translation, or the original when there is none
      if (!$for_hidden_element) {
          $pieces[] = $translated_text ? $translated_text : $original_text;
      }
      $pieces[] = '</span>';
      ++$this->span_id;
      return implode('', $pieces);
  }
  540. /**
  541. * This function does some ad replacement for transposh benefit
  542. */
  function do_ad_switch() {
      // Goes over the noise blocks simple_html_dom kept aside and, in those
      // that contain 'google_ad_client', replaces the publisher id that
      // follows 'pub-' (up to its closing quote) with the Transposh one.
      // NOTE(review): presumably only called when the $allow_ad option is set
      // - the call site is not visible from here, confirm.
      if (!isset($this->html->noise) || !is_array($this->html->noise)) {
          return;
      }
      foreach ($this->html->noise as $key => $value) {
          if (strpos($value, 'google_ad_client') === false) {
              continue;
          }
          $publoc = strpos($value, 'pub-');
          // not found (an id at offset 0 is skipped as well, as before)
          if (!$publoc) {
              continue;
          }
          // The id ends at the nearest quote of either kind. Looking for a
          // double quote first could run far past an id that was written in
          // single quotes and cut out unrelated parts of the script.
          $sufloc = $publoc + strcspn($value, "\"'", $publoc);
          if ($sufloc >= strlen($value)) {
              continue; // no closing quote at all, leave the block alone
          }
          $this->html->noise[$key] = substr($value, 0, $publoc) . 'pub-7523823497771676' . substr($value, $sufloc);
      }
  }
  557. /**
  558. * Main function - actually translates a given HTML
  559. * @param string $string containing HTML
  560. * @return string Translated content is here
  561. */
  562. function fix_html($string) {
  563. // ready our stats
  564. $this->stats = new parserstats();
  565. // handler for possible json (buddypress)
  566. if ($this->might_json) {
  567. if ($string[0] == '{') {
  568. $jsoner = json_decode($string);
  569. if ($jsoner != null) {
  570. // currently we only handle contents (which buddypress heavily use)
  571. if ($jsoner->contents) {
  572. $jsoner->contents = $this->fix_html($jsoner->contents);
  573. return json_encode($jsoner);
  574. }
  575. }
  576. }
  577. }
  578. // create our dom
  579. $this->html = str_get_html($string);
  580. // mark translateable elements
  581. $this->html->find('html', 0)->lang = ''; // Document defined lang may be preset to correct lang, but should be ignored TODO: Better?
  582. $this->translate_tagging($this->html->root);
  583. // first fix the html tag itself - we might need to to the same for all such attributes with flipping
  584. if ($this->dir_rtl) $this->html->find('html', 0)->dir = 'rtl';
  585. else $this->html->find('html', 0)->dir = 'ltr';
  586. if ($this->lang) {
  587. $this->html->find('html', 0)->lang = $this->lang;
  588. // add support for <meta name="language" content="<lang>">
  589. if ($this->html->find('meta[name=language]')) {
  590. $this->html->find('meta[name=language]')->content = $this->lang;
  591. }
  592. }
  593. // not much point in further processing if we don't have a function that does it
  594. if ($this->fetch_translate_func == null) {
  595. return $this->html;
  596. }
  597. // fix feed
  598. if ($this->feed_fix) {
  599. // fix urls on feed
  600. foreach (array('link', 'wfw:commentrss', 'comments') as $tag) {
  601. foreach ($this->html->find($tag) as $e) {
  602. $e->innertext = htmlspecialchars(call_user_func_array($this->url_rewrite_func, array($e->innertext)));
  603. // no need to translate anything here
  604. unset($e->nodes);
  605. }
  606. }
  607. // guid is not really a url -- in some future, we can check if permalink is true and probably falsify it
  608. foreach ($this->html->find('guid') as $e) {
  609. $e->innertext = $e->innertext . '-' . $this->lang;
  610. unset($e->nodes);
  611. }
  612. // fix feed language
  613. $this->html->find('language', 0)->innertext = $this->lang;
  614. unset($this->html->find('language', 0)->nodes);
  615. } else {
  616. // since this is not a feed, we might have references to such in the <link rel="alternate">
  617. foreach ($this->html->find('link') as $e) {
  618. if (strcasecmp($e->rel, 'alternate') == 0 || strcasecmp($e->rel, 'canonical') == 0) {
  619. $e->href = call_user_func_array($this->url_rewrite_func, array($e->href));
  620. }
  621. }
  622. }
  623. // try some prefetching... (//todo - maybe move directly to the phrase create)
  624. $originals = array();
  625. if ($this->prefetch_translate_func != null) {
  626. foreach ($this->html->find('text') as $e) {
  627. foreach ($e->nodes as $ep) {
  628. if ($ep->phrase) $originals[$ep->phrase] = true;
  629. }
  630. }
  631. foreach (array('title', 'value') as $title) {
  632. foreach ($this->html->find('[' . $title . ']') as $e) {
  633. if (isset($e->nodes)) foreach ($e->nodes as $ep) {
  634. if ($ep->phrase) $originals[$ep->phrase] = true;
  635. }
  636. }
  637. }
  638. foreach ($this->html->find('[content]') as $e) {
  639. foreach ($e->nodes as $ep) {
  640. if ($ep->phrase) $originals[$ep->phrase] = true;
  641. }
  642. }
  643. // if we should split, we will split some urls for translation prefetching
  644. if ($this->split_url_func != null) {
  645. foreach ($this->atags as $e) {
  646. foreach (call_user_func_array($this->split_url_func, array($e->href)) as $part) {
  647. $originals[$part] = true;
  648. }
  649. }
  650. foreach ($this->otags as $e) {
  651. foreach (call_user_func_array($this->split_url_func, array($e->value)) as $part) {
  652. $originals[$part] = true;
  653. }
  654. }
  655. }
  656. call_user_func_array($this->prefetch_translate_func, array($originals, $this->lang));
  657. }
  658. //fix urls more
  659. // WORK IN PROGRESS
  660. /* foreach ($this->atags as $e) {
  661. $hrefspans = '';
  662. foreach (call_user_func_array($this->split_url_func, array($e->href)) as $part) {
  663. // fix - not for dashes
  664. list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($part, $this->lang));
  665. $hrefspans .= $this->create_edit_span($part, $translated_text, $source, true);
  666. }
  667. $e->href = call_user_func_array($this->url_rewrite_func, array($e->href));
  668. $e->outertext .= $hrefspans;
  669. } */
  670. // fix urls...
  671. foreach ($this->atags as $e) {
  672. if ($e->href)
  673. $e->href = call_user_func_array($this->url_rewrite_func, array($e->href));
  674. }
  675. foreach ($this->otags as $e) {
  676. if ($e->value)
  677. $e->value = call_user_func_array($this->url_rewrite_func, array($e->value));
  678. }
  679. // this is used to reserve spans we cannot add directly (out of body, metas, etc)
  680. $hiddenspans = '';
  681. $savedspan = '';
  682. // actually translate tags
  683. // texts are first
  684. foreach ($this->html->find('text') as $e) {
  685. $replace = array();
  686. foreach ($e->nodes as $ep) {
  687. list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($ep->phrase, $this->lang));
  688. //stats
  689. $this->stats->total_phrases++;
  690. if ($translated_text) {
  691. $this->stats->translated_phrases++;
  692. if ($source == 0) $this->stats->human_translated_phrases++;
  693. }
  694. if (($this->is_edit_mode || ($this->is_auto_translate && $translated_text == null))/* && $ep->inbody */) {
  695. if ($ep->inselect) {
  696. $savedspan .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
  697. } elseif (!$ep->inbody) {
  698. $hiddenspans .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
  699. } else {
  700. $translated_text = $this->create_edit_span($ep->phrase, $translated_text, $source, false, $ep->srclang);
  701. }
  702. }
  703. // store replacements
  704. if ($translated_text) {
  705. $replace[$translated_text] = $ep;
  706. }
  707. }
  708. // do replacements in reverse
  709. foreach (array_reverse($replace, true) as $replace => $epg) {
  710. $e->outertext = substr_replace($e->outertext, $replace, $epg->start, $epg->len);
  711. }
  712. // this adds saved spans to the first not in select element which is in the body
  713. if (!$ep->inselect && $savedspan && $ep->inbody) { // (TODO: might not be...?)
  714. $e->outertext = $savedspan . $e->outertext;
  715. $savedspan = '';
  716. }
  717. }
  718. // now we handle the title attributes (and the value of submit buttons)
  719. $hidden_phrases = array();
  720. foreach (array('title', 'value') as $title) {
  721. foreach ($this->html->find('[' . $title . ']') as $e) {
  722. $replace = array();
  723. $span = '';
  724. // when we already have a parent outertext we'll have to update it directly
  725. if (isset($e->parent->_[HDOM_INFO_OUTER])) {
  726. $saved_outertext = $e->outertext;
  727. }
  728. if (isset($e->nodes)) foreach ($e->nodes as $ep) {
  729. if ($ep->tag == 'phrase') {
  730. list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($ep->phrase, $this->lang));
  731. // more stats
  732. $this->stats->total_phrases++;
  733. if ($ep->inbody) $this->stats->hidden_phrases++; else
  734. $this->stats->meta_phrases++;
  735. if ($translated_text) {
  736. $this->stats->translated_phrases++;
  737. if ($ep->inbody)
  738. $this->stats->hidden_translated_phrases++; else
  739. $this->stats->meta_translated_phrases++;
  740. if ($source == 0)
  741. $this->stats->human_translated_phrases++;
  742. }
  743. if (($this->is_edit_mode || ($this->is_auto_translate && $translated_text == null)) && $ep->inbody) {
  744. // prevent duplicate translation (title = text)
  745. if (strpos($e->innertext, transposh_utils::base64_url_encode($ep->phrase)) === false) {
  746. //no need to translate span the same hidden phrase more than once
  747. if (!in_array($ep->phrase, $hidden_phrases)) {
  748. $this->stats->hidden_translateable_phrases++;
  749. $span .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
  750. //
  751. $hidden_phrases[] = $ep->phrase;
  752. }
  753. }
  754. }
  755. // if we need to replace, we store this
  756. if ($translated_text) {
  757. $replace[$translated_text] = $ep;
  758. }
  759. }
  760. }
  761. // and later replace
  762. foreach (array_reverse($replace, true) as $replace => $epg) {
  763. $e->title = substr_replace($e->title, $replace, $epg->start, $epg->len);
  764. }
  765. $e->outertext .= $span;
  766. // this is where we update in the outercase issue
  767. if (isset($e->parent->_[HDOM_INFO_OUTER])) {
  768. $e->parent->outertext = implode($e->outertext, explode($saved_outertext, $e->parent->outertext, 2));
  769. }
  770. }
  771. }
  772. // now we handle the meta content - which is simpler because they can't be edited or auto-translated in place
  773. // we also don't expect any father modifications here
  774. // so we now add all those spans right before the <body> tag end
  775. foreach ($this->html->find('[content]') as $e) {
  776. $right = '';
  777. $newtext = '';
  778. foreach ($e->nodes as $ep) {
  779. if ($ep->tag == 'phrase') {
  780. // even more stats
  781. $this->stats->total_phrases++;
  782. $this->stats->meta_phrases++;
  783. list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($ep->phrase, $this->lang));
  784. if ($translated_text) {
  785. $this->stats->translated_phrases++;
  786. $this->stats->meta_translated_phrases++;
  787. if ($source == 0)
  788. $this->stats->human_translated_phrases++;
  789. list ($left, $right) = explode($ep->phrase, $e->content, 2);
  790. $newtext .= $left . $translated_text;
  791. $e->content = $right;
  792. }
  793. if ($this->is_edit_mode) {
  794. $hiddenspans .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
  795. }
  796. if (!$translated_text && $this->is_auto_translate && !$this->is_edit_mode) {
  797. if ($this->is_edit_mode || $this->is_auto_translate) { // FIX
  798. }
  799. }
  800. }
  801. }
  802. if ($newtext) {
  803. $e->content = $newtext . $right;
  804. }
  805. }
  806. if ($hiddenspans) {
  807. $body = $this->html->find('body', 0);
  808. if ($body != null) $body->lastChild()->outertext .= $hiddenspans;
  809. }
  810. // we might show an ad for transposh in some cases
  811. if (($this->allow_ad && !$this->default_lang && mt_rand(1, 100) > 95) || // 5 of 100 for translated non default language pages
  812. ($this->allow_ad && $this->default_lang && mt_rand(1, 100) > 99) || // 1 of 100 for translated default languages pages
  813. (!$this->allow_ad && mt_rand(1, 1000) > 999)) { // 1 of 1000 otherwise
  814. $this->do_ad_switch();
  815. }
  816. // This adds a meta tag with our statistics json-encoded inside...
  817. $this->stats->stop_timing();
  818. $head = $this->html->find('head', 0);
  819. if ($this->edit_span_created) {
  820. if ($head != null) {
  821. $head->lastChild()->outertext .= $this->added_header;
  822. }
  823. }
  824. //exit;
  825. if ($head != null)
  826. $head->lastChild()->outertext .= "\n<meta name=\"translation-stats\" content='" . json_encode($this->stats) . "'/>";
  827. // we make sure that the result is clear from our shananigans
  828. return str_replace(array(TP_GTXT_BRK, TP_GTXT_IBRK, TP_GTXT_BRK_CLOSER, TP_GTXT_IBRK_CLOSER), '', $this->html->outertext);
  829. // Changed because of places where tostring failed
  830. //return $this->html;
  831. //return $this->html->outertext;
  832. }
  /**
   * Returns the list of phrases found in a given HTML string.
   *
   * The string is wrapped in a span element with lang="xx", parsed into a DOM,
   * run through translate_tagging(), and every resulting node tagged 'phrase'
   * is collected. The parsed DOM is stored in $this->html as a side effect.
   *
   * @param string $string Html with phrases to extract
   * @return array List of phrases, each keyed by its own text (or an empty one)
   * @since 0.3.5
   */
  function get_phrases_list($string) {
      $result = array();
      // create our dom - the wrapping span has to be closed properly,
      // otherwise a second (stray, unclosed) span is opened after the content
      $this->html = str_get_html('<span lang="xx">' . $string . '</span>');
      // NOTE(review): some simple_html_dom versions return false instead of a
      // DOM object (e.g. for oversized input); bail out with the empty list
      // rather than dereferencing a non-object below
      if (!is_object($this->html)) {
          return $result;
      }
      // mark translateable elements
      $this->translate_tagging($this->html->root);
      // keying by the phrase itself makes repeated phrases collapse into one entry
      foreach ($this->html->nodes as $ep) {
          if ($ep->tag == 'phrase') {
              $result[$ep->phrase] = $ep->phrase;
          }
      }
      return $result;
  }
  852. }
  853. ?>