/lib/ezc/Document/src/document/bbcode/tokenizer.php
PHP | 293 lines | 126 code | 27 blank | 140 comment | 10 complexity | 8af1720f49e47505c44e4c2646e70de6 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, LGPL-2.1
- <?php
- /**
- * File containing the ezcDocumentBBCodeTokenizer
- *
- * @package Document
- * @version //autogen//
- * @copyright Copyright (C) 2005-2010 eZ Systems AS. All rights reserved.
- * @license http://ez.no/licenses/new_bsd New BSD License
- */
/**
 * Tokenizer for bbcode documents
 *
 * The tokenizer used for all bbcode documents should prepare a token array,
 * which can be used by the bbcode parser, without any bbcode language specific
 * handling in the parser itself required.
 *
 * Token extraction
 * ----------------
 *
 * For the token extraction the regular expressions in the $tokens property
 * are used. The $tokens array has to be built like this, and can be created
 * in the constructor:
 *
 * <code>
 *  array(
 *      array(
 *          'class' => Class name of token,
 *          'match' => Regular expression to match,
 *      ),
 *      ...
 *  )
 * </code>
 *
 * The array is evaluated in the given order, until one of the regular
 * expressions matches. The regular expression should have at least one named
 * match (?P<value> ... ), with the name "value", which will be assigned to the
 * token, created from the given class name, as its content. The matched
 * contents will be removed from the beginning of the string.
 *
 * Optionally a second named match, called "match", may be used inside the
 * regular expression. If so, only the contents inside this match will be
 * removed from the beginning of the string. This enables you to perform a
 * trivial lookahead inside the tokenizer.
 *
 * If no expression matches, an exception will be thrown.
 *
 * @package Document
 * @version //autogen//
 */
class ezcDocumentBBCodeTokenizer
{
    /**
     * List with tokens and a regular expression matching the given token.
     *
     * The tokens are matched in the given order.
     *
     * @var array
     */
    protected $tokens = array();

    /**
     * Common whitespace characters. The vertical tab is excluded, because it
     * causes strange problems with PCRE.
     */
    const WHITESPACE_CHARS = '[\\x20\\t]';

    /**
     * Characters ending a pure text section.
     */
    const TEXT_END_CHARS = '\\[\\]\\r\\n';

    /**
     * Special characters, which do have some special meaning and therefore
     * may not have been matched otherwise.
     */
    const SPECIAL_CHARS = '\\[\\]';

    /**
     * Construct tokenizer
     *
     * Create the token array with a regular expression matching the
     * respective token. The order of the entries is significant, because the
     * first matching expression wins.
     *
     * @return void
     */
    public function __construct()
    {
        $this->tokens = array(
            // Match tokens which require to be at the start of a line before
            // matching the actual newlines, because they are the indicator for
            // line starts.
            array(
                'class' => 'ezcDocumentBBCodeLiteralBlockToken',
                'match' => '(\\A(?P<match>\\[code(?:=[^\\]]+)?\\](?P<value>.+)\\[/code\\]))SUs' ),
            array(
                'class' => 'ezcDocumentBBCodeListItemToken',
                'match' => '(\\A(?P<match>\\[\\*\\]))SUs' ),
            array(
                'class' => 'ezcDocumentBBCodeTagOpenToken',
                'match' => '(\\A(?P<match>\\[(?P<value>[A-Za-z]+(?:=[^\\]]+)?)\\]))SUs' ),
            array(
                'class' => 'ezcDocumentBBCodeTagCloseToken',
                'match' => '(\\A(?P<match>\\[/(?P<value>[A-Za-z]+)\\]))SUs' ),

            // Whitespaces
            array(
                'class' => 'ezcDocumentBBCodeNewLineToken',
                'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
            array(
                'class' => 'ezcDocumentBBCodeWhitespaceToken',
                'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
            array(
                'class' => 'ezcDocumentBBCodeEndOfFileToken',
                'match' => '(\\A(?P<value>\\x0c))S' ),

            // Escape character
            array(
                'class' => 'ezcDocumentBBCodeEscapeCharacterToken',
                'match' => '(\\A(?P<value>~))S' ),

            // Match text, except for the characters ending a text section
            array(
                'class' => 'ezcDocumentBBCodeTextLineToken',
                'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

            // Match all special characters, which are not valid textual chars,
            // but have not been matched by any other expression.
            array(
                'class' => 'ezcDocumentBBCodeSpecialCharsToken',
                'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
        );
    }

    /**
     * Tokenize the given file
     *
     * The method tries to tokenize the passed file and returns an array of
     * ezcDocumentBBCodeToken structs on success, or throws an
     * ezcDocumentBBCodeTokenizerException, if something could not be matched
     * by any token.
     *
     * @throws ezcBaseFileNotFoundException
     *         If the file does not exist or is not readable.
     *
     * @param string $file
     * @return array
     */
    public function tokenizeFile( $file )
    {
        if ( !file_exists( $file ) || !is_readable( $file ) )
        {
            throw new ezcBaseFileNotFoundException( $file );
        }

        return $this->tokenizeString( file_get_contents( $file ) );
    }

    /**
     * Convert tabs to spaces
     *
     * Convert all tabs to spaces, as defined in:
     * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#whitespace
     *
     * Each tab is replaced by as many spaces as are required to reach the
     * next tab stop, with tab stops every 8 columns. The token position is
     * treated as a 1-based column, which is how tokenizeString() counts, so a
     * tab always expands to between 1 and 8 spaces.
     *
     * @param ezcDocumentBBCodeToken $token
     * @return void
     */
    protected function convertTabs( ezcDocumentBBCodeToken $token )
    {
        while ( ( $offset = strpos( $token->content, "\t" ) ) !== false )
        {
            // 1-based column of the tab character inside the line. The
            // content before $offset has already been expanded, so the offset
            // reflects the real column.
            $column = $token->position + $offset;

            // Width up to the next tab stop. The former expression
            // "9 - ( $column % 8 )" returned 9 instead of 1 for tabs in
            // columns 8, 16, ... and is otherwise identical.
            $width = 8 - ( ( $column - 1 ) % 8 );

            $token->content =
                substr( $token->content, 0, $offset ) .
                str_repeat( ' ', $width ) .
                substr( $token->content, $offset + 1 );
        }
    }

    /**
     * Tokenize the given string
     *
     * The method tries to tokenize the passed string and returns an array of
     * ezcDocumentBBCodeToken structs on success, or throws an
     * ezcDocumentBBCodeTokenizerException, if something could not be matched
     * by any token.
     *
     * The returned list always ends with two newline tokens followed by an
     * end of file token.
     *
     * @throws ezcDocumentBBCodeTokenizerException
     *         If none of the token expressions matches the remaining input.
     *
     * @param string $string
     * @return array
     */
    public function tokenizeString( $string )
    {
        $line     = 1;
        $position = 1;
        $tokens   = array();

        // Normalize newlines: every newline sequence, together with the
        // whitespace directly in front of it, becomes a single "\n".
        $string = preg_replace(
            '(' . self::WHITESPACE_CHARS . '*(?:\\r\\n|\\r|\\n))',
            "\n",
            $string
        );

        while ( strlen( $string ) > 0 )
        {
            foreach ( $this->tokens as $definition )
            {
                if ( !preg_match( $definition['match'], $string, $matches ) )
                {
                    continue;
                }

                // If the first part of the match is a newline, add a
                // respective token to the stack.
                if ( ( $matches[0][0] === "\n" ) &&
                     ( $definition['class'] !== 'ezcDocumentBBCodeNewLineToken' ) )
                {
                    $tokens[] = new ezcDocumentBBCodeNewLineToken( $matches[0][0], $line, $position );
                    ++$line;
                    $position = 0;
                }

                // A token matched, so create the token and update all
                // variables.
                $class    = $definition['class'];
                $newToken = new $class(
                    ( isset( $matches['value'] ) ? $matches['value'] : null ),
                    $line,
                    $position
                );

                // Remove the matched contents from the input string. If the
                // expression defines a "match" group, only that part is
                // consumed.
                $consumed = isset( $matches['match'] ) ? $matches['match'] : $matches[0];
                $string   = substr( $string, strlen( $consumed ) );

                if ( $newToken instanceof ezcDocumentBBCodeNewLineToken )
                {
                    // On a newline token reset the line position and increase
                    // the line value.
                    ++$line;
                    $position = 0;
                }
                elseif ( ( $newLines = substr_count( $consumed, "\n" ) ) > 0 )
                {
                    // Otherwise still update the line value, when there is at
                    // minimum one newline in the match. This may lead to a
                    // false position value.
                    $line    += $newLines;
                    $position = 0;
                }

                // Convert tabs to spaces for whitespace tokens
                if ( $newToken instanceof ezcDocumentBBCodeWhitespaceToken )
                {
                    $this->convertTabs( $newToken );
                }

                // If we found an explicit EOF token, just exit the parsing
                // process. The EOF token itself is not added here, the
                // closing tokens are appended below.
                if ( $newToken instanceof ezcDocumentBBCodeEndOfFileToken )
                {
                    break 2;
                }

                // Add token to extracted token list
                $tokens[] = $newToken;

                // Update position, not before converting tabs to spaces. The
                // content is null for tokens without a "value" group, so cast
                // it to avoid passing null to strlen().
                $position += ( $newToken instanceof ezcDocumentBBCodeNewLineToken )
                    ? 1
                    : strlen( (string) $newToken->content );

                // Restart the while loop, because we matched a token and can
                // retry with the shortened string.
                continue 2;
            }

            // None of the token definitions matched the input string. We throw
            // an exception with the position of the content in the input
            // string and the contents we could not match.
            //
            // This should never be thrown, but it is hard to prove that
            // there is nothing which is not matched by the regular expressions
            // above.
            throw new ezcDocumentBBCodeTokenizerException(
                $line,
                $position,
                $string
            );
        }

        // Finally append two newline tokens and an end of file token, to make
        // parsing the end easier.
        $tokens[] = new ezcDocumentBBCodeNewLineToken( "\n", $line, $position );
        $tokens[] = new ezcDocumentBBCodeNewLineToken( "\n", $line, $position );
        $tokens[] = new ezcDocumentBBCodeEndOfFileToken( null, $line, $position );

        return $tokens;
    }
}
- ?>