/atlas/engine/ezc/Document/src/document/wiki/tokenizer/dokuwiki.php
PHP | 286 lines | 183 code | 19 blank | 84 comment | 6 complexity | 300d80125270fbae4905ea0831c83d81 MD5 | raw file
Possible License(s): BSD-3-Clause
- <?php
- /**
- * File containing the ezcDocumentWikiDokuwikiTokenizer class
- *
- * @package Document
- * @version 1.1.2
- * @copyright Copyright (C) 2005-2009 eZ Systems AS. All rights reserved.
- * @license http://ez.no/licenses/new_bsd New BSD License
- */
- /**
- * Tokenizer for Dokuwiki wiki documents.
- *
- * The Dokuwiki wiki is a very popular wiki, which for example is currently
- * used at http://wiki.php.net. The Dokuwiki syntax definition can be found at:
- *
- * http://www.dokuwiki.org/syntax
- *
- * For the basic workings of the tokenizer see the class level documentation in
- * the ezcDocumentWikiTokenizer class.
- *
- * @package Document
- * @version 1.1.2
- */
class ezcDocumentWikiDokuwikiTokenizer extends ezcDocumentWikiTokenizer
{
    /**
     * Common whitespace characters. The vertical tab is excluded, because it
     * causes strange problems with PCRE.
     */
    const WHITESPACE_CHARS = '[\\x20\\t]';

    /**
     * Characters ending a pure text section.
     */
    const TEXT_END_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=\\r\\n\\t\\x20';

    /**
     * Special characters, which do have some special meaning and thus may
     * not have been matched otherwise.
     */
    const SPECIAL_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=';

    /**
     * Construct tokenizer
     *
     * Create the token array, where each entry pairs a token class name with
     * the regular expression matching the respective token. The order of the
     * entries is significant: line-start tokens are listed before the plain
     * newline token, and the generic text / special character tokens come
     * last as fallbacks.
     *
     * @return void
     */
    public function __construct()
    {
        $this->tokens = array(
            // Match tokens which require to be at the start of a line before
            // matching the actual newlines, because they are the indicator for
            // line starts.
            array(
                'class' => 'ezcDocumentWikiTitleToken',
                'match' => '(\\A(?P<match>(?:\\n|' . self::WHITESPACE_CHARS . '+)(?P<value>={2,6}))(?:\\n|' . self::WHITESPACE_CHARS . '+))S' ),
            array(
                'class' => 'ezcDocumentWikiBulletListItemToken',
                'match' => '(\\A\\n(?P<value>\\x20*\\*)' . self::WHITESPACE_CHARS . '+)S' ),
            array(
                'class' => 'ezcDocumentWikiEnumeratedListItemToken',
                'match' => '(\\A\\n(?P<value>\\x20*-)' . self::WHITESPACE_CHARS . '+)S' ),
            array(
                'class' => 'ezcDocumentWikiLiteralBlockToken',
                'match' => '(\\A(?P<match>\\n<(code|file)>\\n(?P<value>.+)\\n</\\2>)\\n)SUsi' ),
            array(
                'class' => 'ezcDocumentWikiLiteralBlockToken',
                'match' => '(\\A(?P<match>\\n(?P<value>(' . self::WHITESPACE_CHARS . '+).*\n(?:\\3.*\n)*)))S' ),
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A(?P<match>\\n<nowiki>\\n(?P<value>.+)\\n</nowiki>)\\n)SUsi' ),
            array(
                'class' => 'ezcDocumentWikiTableRowToken',
                'match' => '(\\A(?P<match>\\n)(?P<value>[|^]))S' ),
            array(
                'class' => 'ezcDocumentWikiParagraphIndentationToken',
                'match' => '(\\A\\n(?P<value>>+)' . self::WHITESPACE_CHARS . '*)S' ),

            // Whitespaces
            array(
                'class' => 'ezcDocumentWikiNewLineToken',
                'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
            array(
                'class' => 'ezcDocumentWikiWhitespaceToken',
                'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
            array(
                'class' => 'ezcDocumentWikiEndOfFileToken',
                'match' => '(\\A(?P<value>\\x0c))S' ),

            // Escape character (token definition currently disabled)
            /*
            array(
                'class' => 'ezcDocumentWikiEscapeCharacterToken',
                'match' => '(\\A(?P<value>~))S' ),
            // */

            // Inline markup
            array(
                'class' => 'ezcDocumentWikiBoldToken',
                'match' => '(\\A(?P<value>\\*\\*))S' ),
            array(
                'class' => 'ezcDocumentWikiItalicToken',
                'match' => '(\\A(?P<value>//))S' ),
            array(
                'class' => 'ezcDocumentWikiMonospaceToken',
                'match' => '(\\A(?P<value>\'\'))S' ),
            array(
                'class' => 'ezcDocumentWikiSuperscriptToken',
                'match' => '(\\A(?P<value></?sup>))Si' ),
            array(
                'class' => 'ezcDocumentWikiSubscriptToken',
                'match' => '(\\A(?P<value></?sub>))Si' ),
            array(
                'class' => 'ezcDocumentWikiUnderlineToken',
                'match' => '(\\A(?P<value>__))S' ),
            array(
                'class' => 'ezcDocumentWikiDeletedToken',
                'match' => '(\\A(?P<value></?del>))Si' ),
            array(
                'class' => 'ezcDocumentWikiInlineLiteralToken',
                'match' => '(\\A<nowiki>(?P<value>.*)</nowiki>)SUi' ),
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A%%(?P<value>.*)%%)SUi' ),
            array(
                'class' => 'ezcDocumentWikiLineBreakToken',
                'match' => '(\\A(?P<match>(?P<value>\\\\\\\\))(?:' . self::WHITESPACE_CHARS . '|\\n))S' ),
            array(
                'class' => 'ezcDocumentWikiLinkStartToken',
                'match' => '(\\A(?P<value>\\[\\[))S' ),
            array(
                'class' => 'ezcDocumentWikiLinkEndToken',
                'match' => '(\\A(?P<value>\\]\\]))S' ),
            array(
                'class' => 'ezcDocumentWikiSeparatorToken',
                'match' => '(\\A(?P<value>\\||' . self::WHITESPACE_CHARS . '*->' . self::WHITESPACE_CHARS . '*))S' ),
            // The following pattern uses the "x" modifier, so whitespace and
            // "#" comments inside the pattern string are ignored by PCRE.
            array(
                'class' => 'ezcDocumentWikiExternalLinkToken',
                'match' => '(\\A
                    (?P<match>
                      (?P<value>
                        # Match common URLs
                        [a-z]+://\S+? |
                        # Match mail addresses enclosed by <>
                        <[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?>
                      )
                      # Greedy match on text end chars, which should NOT be included in URLs
                    )[,.?!:;"\']?(?:' . self::WHITESPACE_CHARS . '|\\n|\\||]]|\\}\\}|$)
                )Sx' ),
            array(
                'class' => 'ezcDocumentWikiInterWikiLinkToken',
                'match' => '(\\A(?P<value>([A-Za-z]+)>[^\\]|]+))S' ),
            array(
                'class' => 'ezcDocumentWikiImageStartToken',
                'match' => '(\\A(?P<value>\\{\\{))S' ),
            array(
                'class' => 'ezcDocumentWikiImageEndToken',
                'match' => '(\\A(?P<value>\\}\\}))S' ),
            array(
                'class' => 'ezcDocumentWikiFootnoteStartToken',
                'match' => '(\\A(?P<value>\\(\\())S' ),
            array(
                'class' => 'ezcDocumentWikiFootnoteEndToken',
                'match' => '(\\A(?P<value>\\)\\)))S' ),
            array(
                'class' => 'ezcDocumentWikiTableHeaderToken',
                'match' => '(\\A(?P<value>\\^))S' ),
            array(
                'class' => 'ezcDocumentWikiPluginToken',
                'match' => '(\\A(?P<value><([a-zA-Z]+).*?</\\2>))Ss' ),

            // Match text except the characters ending a pure text section
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

            // Match all special characters, which are not valid textual chars,
            // but have not been matched by any other expression.
            array(
                'class' => 'ezcDocumentWikiSpecialCharsToken',
                'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
        );
    }

    /**
     * Parse plugin contents
     *
     * Plugins are totally different in each wiki component and its contents
     * should not be passed through the normal wiki parser. So we fetch its
     * contents completely and let each tokenizer extract names and parameters
     * from the complete token itself.
     *
     * On a successful match the token receives its lowercased type, the raw
     * parameter string wrapped in an array (or an empty array if there is
     * none) and the enclosed text. The token is left untouched if its content
     * does not look like "<name params>...</name>".
     *
     * @param ezcDocumentWikiPluginToken $plugin
     * @return void
     */
    protected function parsePluginContents( ezcDocumentWikiPluginToken $plugin )
    {
        // Match name of plugin, its optional parameters and its contents
        if ( !preg_match( '(^\\s*<(?P<type>[a-zA-Z]+)(?:\\s+(?P<params>[^>]+))?>(?P<content>.*?)\\s*</\\1>\\s*)si', $plugin->content, $match ) )
        {
            return;
        }

        $plugin->type = strtolower( $match['type'] );

        // Compare against the empty string instead of relying on
        // truthiness, so that a literal "0" parameter is not discarded.
        $plugin->parameters = ( isset( $match['params'] ) && ( $match['params'] !== '' ) )
            ? array( $match['params'] )
            : array();

        $plugin->text = $match['content'];
    }

    /**
     * Filter tokens
     *
     * Method to filter tokens, after the input string has been tokenized. The
     * filter should extract additional information from tokens, which are not
     * generally available yet, like the depth of a title depending on the
     * title markup.
     *
     * Whitespace tokens directly inside image markup are consumed (removed
     * from the returned array) and translated into the alignment of the
     * image start token.
     *
     * @param array $tokens
     * @return array
     */
    protected function filterTokens( array $tokens )
    {
        $lastImageStartToken = null;

        // The loop iterates over the token list as it was on entry, while
        // $tokens itself is modified and finally returned.
        foreach ( $tokens as $nr => $token )
        {
            switch ( true )
            {
                // Extract the title / indentation level from the tokens
                // length.
                case $token instanceof ezcDocumentWikiTitleToken:
                    $token->level = 7 - strlen( trim( $token->content ) );
                    break;

                case $token instanceof ezcDocumentWikiParagraphIndentationToken:
                    $token->level = strlen( trim( $token->content ) );
                    break;

                case $token instanceof ezcDocumentWikiImageStartToken:
                    // Check if an alignment has been specified by whitespace
                    // tokens.
                    $lastImageStartToken = $token;
                    $next = $nr + 1;

                    // The image start token may be the very last token, so
                    // verify the following token exists before inspecting it.
                    if ( isset( $tokens[$next] ) &&
                         ( $tokens[$next] instanceof ezcDocumentWikiWhitespaceToken ) )
                    {
                        $token->alignement = 'right';
                        unset( $tokens[$next] );
                        ++$next;
                    }

                    // Extract an optional "?<width>[x<height>]" size suffix
                    // from the token following the image start.
                    if ( isset( $tokens[$next] ) &&
                         preg_match( '(\\?(?P<width>\d+)(?:x(?P<height>\d+))?$)', $tokens[$next]->content, $match ) )
                    {
                        $tokens[$next]->content = substr( $tokens[$next]->content, 0, -strlen( $match[0] ) );
                        $token->width = isset( $match['width'] ) ? (int) $match['width'] : null;
                        $token->height = isset( $match['height'] ) ? (int) $match['height'] : null;
                    }
                    break;

                case $token instanceof ezcDocumentWikiImageEndToken:
                case $token instanceof ezcDocumentWikiSeparatorToken:
                    // Check if an alignment has been specified by whitespace
                    // tokens.
                    $previous = $nr - 1;

                    // The previous token does not exist for the first token
                    // and may already have been removed by the image start
                    // handling, so check for its existence first.
                    if ( ( $lastImageStartToken !== null ) &&
                         isset( $tokens[$previous] ) &&
                         ( $tokens[$previous] instanceof ezcDocumentWikiWhitespaceToken ) )
                    {
                        // Whitespace on both sides means centered, trailing
                        // whitespace only means left aligned.
                        $lastImageStartToken->alignement = $lastImageStartToken->alignement === 'right' ? 'center' : 'left';
                        unset( $tokens[$previous] );
                    }

                    $lastImageStartToken = null;
                    break;

                case $token instanceof ezcDocumentWikiBulletListItemToken:
                case $token instanceof ezcDocumentWikiEnumeratedListItemToken:
                    // The indentation is the number of spaces in the token
                    $token->indentation = substr_count( $token->content, ' ' );
                    break;

                case $token instanceof ezcDocumentWikiPluginToken:
                    $this->parsePluginContents( $token );
                    break;
            }
        }

        return $tokens;
    }
}
- ?>