PageRenderTime 27ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/extensions/Translate/ffs/Gettext.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 562 lines | 419 code | 76 blank | 67 comment | 60 complexity | ff036645fa6ae0025ef25d63247705aa MD5 | raw file
  1. <?php
  2. /**
  3. * Gettext file format handler for both old and new style message groups.
  4. *
  5. * @author Niklas Laxström
  6. * @author Siebrand Mazeland
  7. * @copyright Copyright © 2008-2010, Niklas Laxström, Siebrand Mazeland
  8. * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
  9. * @file
  10. */
  /**
   * Identifies Gettext plural exceptions.
   *
   * Thrown by GettextFFS::splitPlural() when a text contains no
   * {{PLURAL:GETTEXT|...}} construct that could be split into forms.
   * The parent class is spelled MWException, as everywhere else in this
   * file, so that case-sensitive autoload maps resolve it.
   */
  class GettextPluralException extends MWException {}
  15. /**
  16. * New-style FFS class that implements support for gettext file format.
  17. * @ingroup FFS
  18. */
  19. class GettextFFS extends SimpleFFS {
  /**
   * Flag consulted by the header, message block and documentation
   * formatters of this class to choose between their two output variants.
   */
  protected $offlineMode = false;

  /**
   * Sets the offline mode flag.
   * @param $value \bool New value of the flag; stored as given.
   */
  public function setOfflineMode( $value ) {
    $this->offlineMode = $value;
  }
  /**
   * Parses the contents of a Gettext file given as a string.
   *
   * @param $data \string Contents of the file.
   * @return \array The result of parseGettext() with an extra AUTHORS entry
   * (taken from "# Author: ..." comment lines) and with every message whose
   * value is the empty string removed from MESSAGES.
   */
  public function readFromVariable( $data ) {
    // Messages and everything else come from the generic parser
    $result = $this->parseGettext( $data );

    // Authors are listed one per comment line
    $authorMatches = array();
    preg_match_all( '/^#\s*Author:\s*(.*)$/m', $data, $authorMatches );
    $result['AUTHORS'] = $authorMatches[1];

    // Entries with an empty value are dropped
    foreach ( $result['MESSAGES'] as $messageKey => $translation ) {
      if ( $translation !== '' ) {
        continue;
      }
      unset( $result['MESSAGES'][$messageKey] );
    }

    return $result;
  }
  /**
   * Instance-level wrapper around parseGettextData() that supplies the
   * group's mangler and the CtxtAsKey setting from $this->extra.
   *
   * @param $data \string Contents of the file.
   * @return \array See parseGettextData().
   */
  public function parseGettext( $data ) {
    // A missing or falsy CtxtAsKey setting means keys are generated from the message
    $useCtxtAsKey = !empty( $this->extra['CtxtAsKey'] );

    return self::parseGettextData( $data, $useCtxtAsKey, $this->group->getMangler() );
  }
  /**
   * Parses gettext data into internal representation.
   *
   * The data is split into sections on blank lines. The first section is
   * the header; every other section is one message. Sections that contain
   * an obsolete ("#~") line are skipped.
   *
   * @param $data \string
   * @param $useCtxtAsKey \bool Whether to create message keys from the context
   * or use msgctxt (non-standard po-files)
   * @param $mangler StringMangler Required, even though it follows an
   * optional parameter; the signature is kept for existing callers.
   * @return \array Array with the keys MESSAGES (key => text), TEMPLATE
   * (key => parsed item), METADATA and HEADERS. MESSAGES and TEMPLATE are
   * empty arrays when the file has a header but no messages.
   * @throws MWException If the header, a msgid or a msgstr cannot be parsed.
   * @todo Refactor method into smaller parts.
   */
  public static function parseGettextData( $data, $useCtxtAsKey = false, $mangler ) {
    $potmode = false;

    // Initialised up front: a file without any message section must still
    // return arrays, not undefined variables.
    $messages = array();
    $template = array();

    // Normalise newlines, to make processing easier later
    $data = str_replace( "\r\n", "\n", $data );

    /* Delimit the file into sections, which are separated by two newlines.
     * We are permissive and accept more than two. This parsing method isn't
     * efficient wrt memory, but was easy to implement */
    $sections = preg_split( '/\n{2,}/', $data );

    /* First one isn't an actual message. We'll handle it specially below */
    $headerSection = array_shift( $sections );

    /* Since this is the header section, we are only interested in the tags
     * and msgid is empty. Somewhere we should extract the header comments
     * too */
    $match = self::expectKeyword( 'msgstr', $headerSection );
    if ( $match === null ) {
      throw new MWException( "Gettext file header was not found:\n\n$data" );
    }
    $headerBlock = self::formatForWiki( $match, 'trim' );
    $headers = self::parseHeaderTags( $headerBlock );

    // Check for pot-mode by checking if the header is fuzzy
    $flags = self::parseFlags( $headerSection );
    if ( in_array( 'fuzzy', $flags, true ) ) {
      $potmode = true;
    }

    // Extract some metadata from headers for easier use
    $metadata = array();
    if ( isset( $headers['X-Language-Code'] ) ) {
      $metadata['code'] = $headers['X-Language-Code'];
    }
    if ( isset( $headers['X-Message-Group'] ) ) {
      $metadata['group'] = $headers['X-Message-Group'];
    }

    /* At this stage we are only interested how many plurals forms we should
     * be expecting when parsing the rest of this file. */
    $pluralCount = false;
    if ( isset( $headers['Plural-Forms'] ) ) {
      $matches = array();
      if ( preg_match( '/nplurals=([0-9]+).*;/', $headers['Plural-Forms'], $matches ) ) {
        $pluralCount = $metadata['plural'] = $matches[1];
      }
    }

    // Then parse the messages
    foreach ( $sections as $section ) {
      if ( trim( $section ) === '' ) {
        continue;
      }

      /* These inactive section are of no interest to us. Multiline mode
       * is needed because there may be flags or other annoying stuff
       * before the commented out sections.
       */
      if ( preg_match( '/^#~/m', $section ) ) {
        continue;
      }

      $item = array(
        'ctxt' => '',
        'id' => '',
        'str' => '',
        'flags' => array(),
        'comments' => array(),
      );

      $match = self::expectKeyword( 'msgid', $section );
      if ( $match === null ) {
        throw new MWException( "Unable to parse msgid:\n\n$section" );
      }
      $item['id'] = self::formatForWiki( $match );

      $match = self::expectKeyword( 'msgctxt', $section );
      if ( $match !== null ) {
        $item['ctxt'] = self::formatForWiki( $match );
      } elseif ( $useCtxtAsKey ) { // Invalid message
        $metadata['warnings'][] = "Ctxt missing for {$item['id']}";
        error_log( "Ctxt missing for {$item['id']}" );
      }

      // A msgid_plural turns the definition into a PLURAL construct
      $pluralMessage = false;
      $match = self::expectKeyword( 'msgid_plural', $section );
      if ( $match !== null ) {
        $pluralMessage = true;
        $plural = self::formatForWiki( $match );
        $item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}";
      }

      if ( $pluralMessage ) {
        // Collect msgstr[0] .. msgstr[nplurals-1]; missing forms become ''
        $actualForms = array();
        for ( $i = 0; $i < $pluralCount; $i++ ) {
          $match = self::expectKeyword( "msgstr\\[$i\\]", $section );
          if ( $match !== null ) {
            $actualForms[] = self::formatForWiki( $match );
          } else {
            $actualForms[] = '';
            error_log( "Plural $i not found, expecting total of $pluralCount for {$item['id']}" );
          }
        }

        // Keep the translation empty if no form has translation
        if ( array_sum( array_map( 'strlen', $actualForms ) ) > 0 ) {
          $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}';
        }
      } else {
        $match = self::expectKeyword( 'msgstr', $section );
        if ( $match === null ) {
          throw new MWException( "Unable to parse msgstr:\n\n$section" );
        }
        $item['str'] = self::formatForWiki( $match );
      }

      // Parse flags; fuzzy is carried in the text itself, not in the flag list
      $flags = self::parseFlags( $section );
      foreach ( $flags as $key => $flag ) {
        if ( $flag === 'fuzzy' ) {
          $item['str'] = TRANSLATE_FUZZY . $item['str'];
          unset( $flags[$key] );
        }
      }
      $item['flags'] = $flags;

      // Rest of the comments, grouped by the character following '#'
      $matches = array();
      if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) {
        foreach ( $matches as $match ) {
          // $match[1] is the comment type, $match[2] the comment text.
          // Flag lines are handled above. The "[Wiki]" marker is part of
          // the text, so it has to be looked for in $match[2].
          if ( $match[1] !== ',' && strpos( $match[2], '[Wiki]' ) !== 0 ) {
            $item['comments'][$match[1]][] = $match[2];
          }
        }
      }

      if ( $useCtxtAsKey ) {
        $key = $item['ctxt'];
      } else {
        $key = self::generateKeyFromItem( $item );
      }
      $key = $mangler->mangle( $key );

      // In pot-mode the definitions are the messages
      $messages[$key] = $potmode ? $item['id'] : $item['str'];
      $template[$key] = $item;
    }

    return array(
      'MESSAGES' => $messages,
      'TEMPLATE' => $template,
      'METADATA' => $metadata,
      'HEADERS' => $headers
    );
  }
  /**
   * Extracts the flags from the first "#," line of a section.
   *
   * @param $section \string One section of a Gettext file.
   * @return \array List of trimmed flag names; empty if there is no flag line.
   */
  public static function parseFlags( $section ) {
    $found = array();
    if ( !preg_match( '/^#,(.*)$/mu', $section, $found ) ) {
      return array();
    }

    $flags = array();
    foreach ( explode( ',', $found[1] ) as $rawFlag ) {
      $flags[] = trim( $rawFlag );
    }
    return $flags;
  }
  /**
   * Catches the multiline textblock that comes after keywords msgid,
   * msgstr, msgid_plural, msgctxt.
   *
   * @param $name \string Keyword to look for. It is inserted into the
   * regular expression as is, so callers escape special characters.
   * @param $section \string One section of a Gettext file.
   * @return \string|null The quoted text block (quotes and newlines
   * included), or null if the keyword was not found.
   */
  public static function expectKeyword( $name, $section ) {
    // One quoted string, optionally continued by further quoted lines
    $poformat = '".*"\n?(^".*"$\n?)*';
    $pattern = "/^$name\s($poformat)/mx";

    $found = array();
    return preg_match( $pattern, $section, $found ) ? $found[1] : null;
  }
  /**
   * Generates unique key for each message. Changing this WILL BREAK ALL
   * existing pages!
   *
   * The key is the SHA-1 hash of context and id, followed by a dash and a
   * shortened, cleaned-up snippet of the id.
   *
   * @param $item \array Parsed message with at least the keys ctxt and id.
   * @return \string
   */
  public static function generateKeyFromItem( $item ) {
    global $wgLegalTitleChars;

    $hash = sha1( $item['ctxt'] . $item['id'] );

    // The patterns are applied one after another: illegal title characters,
    // then some legal but unwanted ones, then runs of spaces.
    $snippet = preg_replace(
      array( "/[^$wgLegalTitleChars]/", "/[:&%\/_]/", "/ {2,}/" ),
      ' ',
      $item['id']
    );
    $snippet = Language::factory( 'en' )->truncate( $snippet, 30, '' );
    $snippet = str_replace( ' ', '_', trim( $snippet ) );

    return "$hash-$snippet";
  }
  /**
   * This parses the Gettext text block format. Since trailing whitespace is
   * not allowed in MediaWiki pages, the default action is to append
   * \-character at the end of the message. You can also choose to ignore it
   * and use the trim action instead.
   *
   * @param $data \string Quoted text block as returned by expectKeyword().
   * @param $whitespace \string Either 'mark' (append a backslash after
   * trailing whitespace) or 'trim' (remove trailing whitespace).
   * @return \string Unquoted and unescaped text.
   * @throws MWException If $whitespace is not one of the known actions.
   */
  public static function formatForWiki( $data, $whitespace = 'mark' ) {
    // Validate first, so that a wrong action is reported for every input
    // and not only for data that happens to end in whitespace.
    if ( $whitespace !== 'mark' && $whitespace !== 'trim' ) {
      throw new MWException( 'Unknown action for whitespace' );
    }

    // Remove the quotes surrounding each line, and the line breaks after them
    $quotePattern = '/(^"|"$\n?)/m';
    $data = preg_replace( $quotePattern, '', $data );
    $data = stripcslashes( $data );

    if ( preg_match( '/\s$/', $data ) ) {
      if ( $whitespace === 'mark' ) {
        $data .= '\\';
      } else {
        $data = rtrim( $data );
      }
    }

    return $data;
  }
  /**
   * Parses the "Name: value" lines of the Gettext header into an array.
   *
   * @param $headers \string Header block, one tag per line.
   * @return \array Tag name => value, both trimmed. Lines without a colon
   * are logged and left out.
   */
  public static function parseHeaderTags( $headers ) {
    $tags = array();
    foreach ( explode( "\n", $headers ) as $line ) {
      if ( strpos( $line, ':' ) === false ) {
        error_log( __METHOD__ . ": $line" );
        // Without a colon there is no value to split off. Continuing
        // would raise a notice and register the whole line as a tag name.
        continue;
      }
      // Only the first colon separates; values may contain further colons
      list( $key, $value ) = explode( ':', $line, 2 );
      $tags[trim( $key )] = trim( $value );
    }
    return $tags;
  }
  /**
   * Produces the contents of a Gettext file for the given collection.
   *
   * @param $collection MessageCollection
   * @return \string File header followed by one block per message.
   */
  protected function writeReal( MessageCollection $collection ) {
    // Both the 'en' file and the file of the target language are read;
    // their TEMPLATE entries supply comments, flags and context per message.
    $pot = $this->read( 'en' );
    $template = $this->read( $collection->code );

    // Filled in by doGettextHeader() through the reference parameter
    $pluralCount = false;
    $output = $this->doGettextHeader( $collection, $template, $pluralCount );

    foreach ( $collection as $key => $m ) {
      $transTemplate = array();
      if ( isset( $template['TEMPLATE'][$key] ) ) {
        $transTemplate = $template['TEMPLATE'][$key];
      }

      $potTemplate = array();
      if ( isset( $pot['TEMPLATE'][$key] ) ) {
        $potTemplate = $pot['TEMPLATE'][$key];
      }

      $output .= $this->formatMessageBlock( $key, $m, $transTemplate, $potTemplate, $pluralCount );
    }

    return $output;
  }
  /**
   * Builds the comment block and the header entry (empty msgid) of the file.
   *
   * @param $collection MessageCollection
   * @param $template \array Parsed existing file; its HEADERS entry, if
   * present, is the starting point for the header tags.
   * @param $pluralCount Out parameter: set to the nplurals value of the
   * Plural-Forms header that is written, as a string of digits.
   * @return \string
   */
  protected function doGettextHeader( MessageCollection $collection, $template, &$pluralCount ) {
    global $wgSitename, $wgCanonicalServer;

    $code = $collection->code;
    $name = TranslateUtils::getLanguageName( $code );
    $native = TranslateUtils::getLanguageName( $code, true );
    $authors = $this->doAuthors( $collection );
    if ( isset( $this->extra['header'] ) ) {
      $extra = "# --\n" . $this->extra['header'];
    } else {
      $extra = '';
    }

    $output = "# Translation of {$this->group->getLabel()} to $name ($native)\n" .
      "# Exported from $wgSitename\n" .
      "#\n" .
      "$authors$extra";

    // Make sure there is no empty line before msgid
    $output = trim( $output ) . "\n";

    // @todo twn specific
    $portal = Title::makeTitle( NS_PORTAL, $code )->getCanonicalUrl();

    // Existing tags keep their position; the ones below are overwritten
    $specs = isset( $template['HEADERS'] ) ? $template['HEADERS'] : array();

    $timestamp = wfTimestampNow();
    $specs['Project-Id-Version'] = $this->group->getLabel();
    $specs['Report-Msgid-Bugs-To'] = $wgSitename;
    $specs['PO-Revision-Date'] = self::formatTime( $timestamp );
    if ( $this->offlineMode ) {
      $specs['POT-Creation-Date'] = self::formatTime( $timestamp );
    } elseif ( $this->group instanceof MessageGroupBase ) {
      $specs['X-POT-Import-Date'] = self::formatTime( wfTimestamp( TS_MW, $this->getPotTime() ) );
    }
    $specs['Language-Team'] = "$name <$portal>";
    $specs['Content-Type'] = 'text/plain; charset=UTF-8';
    $specs['Content-Transfer-Encoding'] = '8bit';
    $specs['X-Generator'] = $this->getGenerator();
    $specs['X-Translation-Project'] = "$wgSitename at $wgCanonicalServer";
    $specs['X-Language-Code'] = $code;
    if ( $this->offlineMode ) {
      $specs['X-Message-Group'] = $this->group->getId();
    } else {
      // Prepend # so that message import does not think this is a file it can import
      $specs['X-Message-Group'] = '#' . $this->group->getId();
    }

    $defaultPlural = 'nplurals=2; plural=(n != 1);';
    $plural = self::getPluralRule( $code );
    if ( $plural ) {
      $specs['Plural-Forms'] = $plural;
    } elseif ( !isset( $specs['Plural-Forms'] ) ) {
      $specs['Plural-Forms'] = $defaultPlural;
    }

    $match = array();
    if ( !preg_match( '/nplurals=(\d+);/', $specs['Plural-Forms'], $match ) ) {
      // The header carries no usable number, e.g. the untouched
      // "nplurals=INTEGER; plural=EXPRESSION;" placeholder of a template.
      // Fall back to the default so the plural count is always defined.
      $specs['Plural-Forms'] = $defaultPlural;
      preg_match( '/nplurals=(\d+);/', $defaultPlural, $match );
    }
    // Kept as the captured string, exactly as before
    $pluralCount = $match[1];

    $output .= 'msgid ""' . "\n";
    $output .= 'msgstr ""' . "\n";
    $output .= '""' . "\n";

    foreach ( $specs as $k => $v ) {
      $output .= self::escape( "$k: $v\n" ) . "\n";
    }

    $output .= "\n";

    return $output;
  }
  /**
   * Formats the authors of the collection as comment lines.
   *
   * @param $collection MessageCollection
   * @return \string One "# Author: name" line per author that passes
   * filterAuthors(); empty string if there are none.
   */
  protected function doAuthors( MessageCollection $collection ) {
    $authors = $this->filterAuthors( $collection->getAuthors(), $collection->code );

    $lines = array();
    foreach ( $authors as $author ) {
      $lines[] = "# Author: $author\n";
    }

    return implode( '', $lines );
  }
  /**
   * Formats one message as a Gettext entry: comments, flags, optional
   * msgctxt, msgid and msgstr (or their plural variants), followed by an
   * empty line.
   *
   * @param $key \string Message key.
   * @param $m Message object; getTags(), definition() and translation()
   * are called on it.
   * @param $trans \array Template item from the file of the target language.
   * @param $pot \array Template item from the 'en' file; takes precedence
   * over $trans.
   * @param $pluralCount Number of msgstr[n] lines for plural messages.
   * @return \string
   */
  protected function formatMessageBlock( $key, $m, $trans, $pot, $pluralCount ) {
    // Comment part of the entry
    $header = $this->formatDocumentation( $key );
    foreach ( self::chainGetter( 'comments', $pot, $trans, array() ) as $type => $typecomments ) {
      foreach ( $typecomments as $comment ) {
        $header .= "#$type $comment\n";
      }
    }

    $templateFlags = self::chainGetter( 'flags', $pot, $trans, array() );
    $flags = array_merge( $m->getTags(), $templateFlags );

    // Context: the key itself in offline mode, otherwise what the templates have
    $content = '';
    if ( $this->offlineMode ) {
      $content .= 'msgctxt ' . self::escape( $key ) . "\n";
    } else {
      $ctxt = self::chainGetter( 'ctxt', $pot, $trans, false );
      if ( $ctxt ) {
        $content .= 'msgctxt ' . self::escape( $ctxt ) . "\n";
      }
    }

    $msgid = $m->definition();
    $msgstr = $m->translation();

    if ( strpos( $msgstr, TRANSLATE_FUZZY ) !== false ) {
      // The fuzzy marker in the text is expressed as a flag in the file
      $msgstr = str_replace( TRANSLATE_FUZZY, '', $msgstr );
      $flags[] = 'fuzzy';
    }

    if ( !preg_match( '/{{PLURAL:GETTEXT/i', $msgid ) ) {
      $content .= 'msgid ' . self::escape( $msgid ) . "\n";
      $content .= 'msgstr ' . self::escape( $msgstr ) . "\n";
    } else {
      $definitionForms = $this->splitPlural( $msgid, 2 );
      $content .= 'msgid ' . $this->escape( $definitionForms[0] ) . "\n";
      $content .= 'msgid_plural ' . $this->escape( $definitionForms[1] ) . "\n";

      try {
        $translationForms = $this->splitPlural( $msgstr, $pluralCount );
        foreach ( $translationForms as $index => $form ) {
          $content .= "msgstr[$index] " . $this->escape( $form ) . "\n";
        }
      } catch ( GettextPluralException $e ) {
        // Unparseable translation: flag it and write empty forms
        $flags[] = 'invalid-plural';
        for ( $i = 0; $i < $pluralCount; $i++ ) {
          $content .= "msgstr[$i] \"\"\n";
        }
      }
    }

    if ( $flags ) {
      sort( $flags );
      $header .= "#, " . implode( ', ', array_unique( $flags ) ) . "\n";
    }

    // An entry without any comment still starts with a bare comment line
    if ( !$header ) {
      $header = "#\n";
    }

    return $header . $content . "\n";
  }
  /**
   * Returns the value stored under $key in the first of $a and $b that
   * has it set (and not null), or $default if neither has.
   *
   * @param $key \string
   * @param $a \array Checked first.
   * @param $b \array Checked second.
   * @param $default Value to return when the key is in neither array.
   * @return mixed
   */
  protected static function chainGetter( $key, $a, $b, $default ) {
    foreach ( array( $a, $b ) as $source ) {
      if ( isset( $source[$key] ) ) {
        return $source[$key];
      }
    }
    return $default;
  }
  /**
   * Formats a timestamp for the date tags of the header, using the
   * sprintfDate() method of the English language object. The output always
   * ends in the literal "+0000".
   *
   * @param $time Timestamp as accepted by Language::sprintfDate().
   * @return \string
   */
  protected static function formatTime( $time ) {
    return Language::factory( 'en' )->sprintfDate( 'xnY-xnm-xnd xnH:xni:xns+0000', $time );
  }
  /**
   * Returns the timestamp of the message group cache of this group, or
   * the current time if that cache does not exist.
   *
   * @return Timestamp from MessageGroupCache::getTimestamp() or wfTimestampNow().
   */
  protected function getPotTime() {
    $cache = new MessageGroupCache( $this->group );
    if ( $cache->exists() ) {
      return $cache->getTimestamp();
    }
    return wfTimestampNow();
  }
  /**
   * Returns the value for the X-Generator header tag, naming the MediaWiki
   * and Translate extension versions.
   *
   * @return \string
   */
  protected function getGenerator() {
    $mediawiki = 'MediaWiki ' . SpecialVersion::getVersion();
    $translate = "; Translate extension (" . TRANSLATE_VERSION . ")";

    return $mediawiki . $translate;
  }
  /**
   * Formats the documentation of a message as "#. [Wiki] ..." comment lines.
   *
   * @param $key \string Message key.
   * @return \string Empty unless offline mode is on, a documentation
   * language code is configured and the documentation is a string.
   */
  protected function formatDocumentation( $key ) {
    global $wgTranslateDocumentationLanguageCode;

    $code = $wgTranslateDocumentationLanguageCode;
    if ( !$this->offlineMode || !$code ) {
      return '';
    }

    $documentation = TranslateUtils::getMessageContent( $key, $code, $this->group->getNamespace() );
    if ( !is_string( $documentation ) ) {
      return '';
    }

    // Every line of the documentation gets its own marked comment line
    $commentLines = array();
    foreach ( explode( "\n", $documentation ) as $line ) {
      $commentLines[] = "#. [Wiki] $line\n";
    }

    return implode( '', $commentLines );
  }
  /**
   * Turns a text into a quoted Gettext string: a final backslash is
   * removed, backslashes and double quotes are escaped, and newlines are
   * written as the two characters \n.
   *
   * @param $line \string
   * @return \string The text wrapped in double quotes.
   */
  protected static function escape( $line ) {
    // There may be \ as a last character, for keeping trailing whitespace
    $withoutMarker = preg_replace( '/\\\\$/', '', $line );
    $escaped = addcslashes( $withoutMarker, '\\"' );

    return '"' . str_replace( "\n", '\n', $escaped ) . '"';
  }
  /**
   * Returns plural rule for Gettext.
   *
   * The rules are read from data/plural-gettext.txt, which has one
   * "code<TAB>rule" pair per line.
   *
   * @param $code \string Language code.
   * @return \string The rule, or an empty string if the file cannot be
   * read or has no rule for the language.
   */
  public static function getPluralRule( $code ) {
    $rulefile = dirname( __FILE__ ) . '/../data/plural-gettext.txt';
    $rules = file_get_contents( $rulefile );
    if ( $rules === false ) {
      // Unreadable file: behave as if no rule was defined
      return '';
    }

    foreach ( explode( "\n", $rules ) as $line ) {
      if ( trim( $line ) === '' ) {
        continue;
      }

      $fields = explode( "\t", $line );
      if ( count( $fields ) < 2 ) {
        // Not a "code<TAB>rule" line; there is no rule to return
        continue;
      }

      if ( $fields[0] === $code ) {
        return $fields[1];
      }
    }

    return '';
  }
  /**
   * Splits a text containing {{PLURAL:GETTEXT|a|b|...}} constructs into
   * one text per plural form. In the text for form n every construct is
   * replaced by its n-th alternative; text around the constructs is kept.
   *
   * @param $text \string
   * @param $forms Number of plural forms to produce.
   * @return \array Texts indexed from 0 to $forms - 1. A form for which
   * some construct has too few alternatives is an empty string.
   * NOTE(review): for the integer 1 the text is returned unchanged as a
   * string, not an array; kept as is for existing callers.
   * @throws GettextPluralException If the text has no plural construct.
   */
  protected function splitPlural( $text, $forms ) {
    if ( $forms === 1 ) {
      return $text;
    }

    // The constructs do not depend on the form, so they are located once
    $plurals = array();
    $hasPlurals = preg_match_all( '/{{PLURAL:GETTEXT\|(.*)}}/iU', $text, $plurals );

    $splitPlurals = array();
    for ( $i = 0; $i < $forms; $i++ ) {
      if ( !$hasPlurals ) {
        throw new GettextPluralException( "Failed to parse plural for: $text" );
      }

      $pluralForm = $text;
      foreach ( $plurals[0] as $index => $definition ) {
        $parsedFormsArray = explode( '|', $plurals[1][$index] );
        if ( !isset( $parsedFormsArray[$i] ) ) {
          error_log( "Too few plural forms in: $text" );
          $pluralForm = '';
        } else {
          // str_replace( search, replace, subject ): substitute the whole
          // construct with the alternative for this form, inside the text.
          $pluralForm = str_replace( $definition, $parsedFormsArray[$i], $pluralForm );
        }
      }
      $splitPlurals[$i] = $pluralForm;
    }

    return $splitPlurals;
  }
  472. }