PageRenderTime 25ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/ezc/Document/src/document/bbcode/tokenizer.php

https://bitbucket.org/crevillo/enetcall
PHP | 293 lines | 126 code | 27 blank | 140 comment | 10 complexity | 8af1720f49e47505c44e4c2646e70de6 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, LGPL-2.1
  1. <?php
  2. /**
  3. * File containing the ezcDocumentBBCodeTokenizer
  4. *
  5. * @package Document
  6. * @version //autogen//
  7. * @copyright Copyright (C) 2005-2010 eZ Systems AS. All rights reserved.
  8. * @license http://ez.no/licenses/new_bsd New BSD License
  9. */
  10. /**
  11. * Tokenizer for bbcode documents
  12. *
  13. * The tokenizer used for all bbcode documents should prepare a token array,
  14. * which can be used by the bbcode parser, without any bbcode language specific
  15. * handling in the parser itself required.
  16. *
  17. * Token extraction
  18. * ----------------
  19. *
  20. * For the token extraction the regular expressions in the $tokens property are
  21. * used. The $tokens array has to be built as follows, and can be created in the
  22. * constructor:
  23. *
  24. * <code>
  25. * array(
  26. * array(
  27. * 'class' => Class name of token,
  28. * 'match' => Regular expression to match,
  29. * ),
  30. * ...
  31. * )
  32. * </code>
  33. *
  34. * The array is evaluated in the given order, until one of the regular
  35. * expressions match. The regular expression should have at least one named
  36. * match (?P<value> ... ), with the name "value", which will be assigned to the
  37. * token, created from the given class name, as its content. The matched
  38. * contents will be removed from the beginning of the string.
  39. * Optionally a second named match, called "match", may be used inside the
  40. * regular expression. If so, only the contents inside this match will be
  41. * removed from the beginning of the string. This enables you to perform a
  42. * trivial lookahead inside the tokenizer.
  43. *
  44. * If no expression matches, an exception will be thrown.
  45. *
  46. * @package Document
  47. * @version //autogen//
  48. */
  class ezcDocumentBBCodeTokenizer
  {
      /**
       * List with tokens and a regular expression matching the given token.
       *
       * Each entry is an array with the keys 'class' (name of the token class
       * to instantiate) and 'match' (regular expression anchored at the start
       * of the remaining input). The entries are tried in the given order.
       *
       * @var array
       */
      protected $tokens = array();

      /**
       * Common whitespace characters. The vertical tab is excluded, because it
       * causes strange problems with PCRE.
       */
      const WHITESPACE_CHARS = '[\\x20\\t]';

      /**
       * Characters ending a pure text section.
       */
      const TEXT_END_CHARS = '\\[\\]\\r\\n';

      /**
       * Special characters, which do have some special meaning and thus may
       * not have been matched otherwise.
       */
      const SPECIAL_CHARS = '\\[\\]';

      /**
       * Distance between two tab stops, in columns, used when tabs inside
       * whitespace tokens are expanded to spaces.
       */
      const TAB_WIDTH = 8;

      /**
       * Construct tokenizer
       *
       * Create token array with regular expressions matching the respective
       * token.
       *
       * @return void
       */
      public function __construct()
      {
          $this->tokens = array(
              // Markup tokens are tried first, so that an opening square
              // bracket is recognized as markup before it can fall through to
              // the special character token at the end of the list.
              array(
                  'class' => 'ezcDocumentBBCodeLiteralBlockToken',
                  'match' => '(\\A(?P<match>\\[code(?:=[^\\]]+)?\\](?P<value>.+)\\[/code\\]))SUs' ),
              array(
                  'class' => 'ezcDocumentBBCodeListItemToken',
                  'match' => '(\\A(?P<match>\\[\\*\\]))SUs' ),
              array(
                  'class' => 'ezcDocumentBBCodeTagOpenToken',
                  'match' => '(\\A(?P<match>\\[(?P<value>[A-Za-z]+(?:=[^\\]]+)?)\\]))SUs' ),
              array(
                  'class' => 'ezcDocumentBBCodeTagCloseToken',
                  'match' => '(\\A(?P<match>\\[/(?P<value>[A-Za-z]+)\\]))SUs' ),

              // Whitespaces
              array(
                  'class' => 'ezcDocumentBBCodeNewLineToken',
                  'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
              array(
                  'class' => 'ezcDocumentBBCodeWhitespaceToken',
                  'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
              // A form feed character is treated as an explicit end of file.
              array(
                  'class' => 'ezcDocumentBBCodeEndOfFileToken',
                  'match' => '(\\A(?P<value>\\x0c))S' ),

              // Escape character
              array(
                  'class' => 'ezcDocumentBBCodeEscapeCharacterToken',
                  'match' => '(\\A(?P<value>~))S' ),

              // Match text up to the next square bracket or newline
              array(
                  'class' => 'ezcDocumentBBCodeTextLineToken',
                  'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

              // Match all special characters, which are not valid textual
              // chars, but have not been matched by any other expression. A
              // run of the same character ends up in a single token.
              array(
                  'class' => 'ezcDocumentBBCodeSpecialCharsToken',
                  'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
          );
      }

      /**
       * Tokenize the given file
       *
       * Reads the file and passes its contents to tokenizeString(). Returns an
       * array of ezcDocumentBBCodeToken structs on success, or throws an
       * ezcDocumentBBCodeTokenizerException, if something could not be matched
       * by any token.
       *
       * @throws ezcBaseFileNotFoundException
       *         If the file does not exist, is not readable, or could not be
       *         read.
       * @param string $file
       * @return array
       */
      public function tokenizeFile( $file )
      {
          if ( !file_exists( $file ) || !is_readable( $file ) )
          {
              throw new ezcBaseFileNotFoundException( $file );
          }

          // file_get_contents() returns false if reading fails, for example
          // when the file vanished after the checks above. Report this rather
          // than silently tokenizing an empty document.
          $contents = file_get_contents( $file );
          if ( $contents === false )
          {
              throw new ezcBaseFileNotFoundException( $file );
          }

          return $this->tokenizeString( $contents );
      }

      /**
       * Convert tabs to spaces
       *
       * Convert all tabs in the content of the given token to spaces, as
       * defined in:
       * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#whitespace
       *
       * The token is modified in place.
       *
       * @param ezcDocumentBBCodeToken $token
       * @return void
       */
      protected function convertTabs( ezcDocumentBBCodeToken $token )
      {
          $token->content = $this->expandTabs( $token->content, $token->position );
      }

      /**
       * Expand tabs in a string to spaces
       *
       * Every tab is replaced by as many spaces as are required to reach the
       * next tab stop. Tab stops are located every self::TAB_WIDTH columns,
       * that is at the columns 1, 9, 17, ... for the default width of 8.
       *
       * @param string $content     String which may contain tabs
       * @param int    $startColumn Column (starting with 1) of the first
       *                            character of $content in its line
       * @return string
       */
      protected function expandTabs( $content, $startColumn )
      {
          // Zero based offset of the first character in the line. Values
          // below 1 are handled like column 1, so that the modulo operation
          // below never sees a negative number.
          $lineOffset = max( 0, (int) $startColumn - 1 );

          while ( ( $tabIndex = strpos( $content, "\t" ) ) !== false )
          {
              // Tabs before this one already have been expanded, so that
              // $tabIndex reflects the real column of this tab.
              $fill = self::TAB_WIDTH - ( ( $lineOffset + $tabIndex ) % self::TAB_WIDTH );

              $content =
                  substr( $content, 0, $tabIndex ) .
                  str_repeat( ' ', $fill ) .
                  substr( $content, $tabIndex + 1 );
          }

          return $content;
      }

      /**
       * Tokenize the given string
       *
       * The method tries to tokenize the passed string and returns an array of
       * ezcDocumentBBCodeToken structs on success, or throws an
       * ezcDocumentBBCodeTokenizerException, if something could not be matched
       * by any token.
       *
       * The returned list always ends with two additional newline tokens and
       * one end of file token.
       *
       * @throws ezcDocumentBBCodeTokenizerException
       *         If none of the token expressions matches the remaining input.
       * @param string $string
       * @return array
       */
      public function tokenizeString( $string )
      {
          $line     = 1;
          $position = 1;
          $tokens   = array();

          // Normalize newlines. This also drops spaces and tabs directly in
          // front of a line break.
          $string = preg_replace( '([\x20\\t]*(?:\\r\\n|\\r|\\n))', "\n", $string );

          while ( strlen( $string ) > 0 )
          {
              foreach ( $this->tokens as $definition )
              {
                  if ( !preg_match( $definition['match'], $string, $matches ) )
                  {
                      continue;
                  }

                  // If the first part of the match is a newline, add a
                  // respective token to the stack. None of the expressions
                  // defined in the constructor, except the one for the newline
                  // token itself, can start with a newline, so this only
                  // matters for token definitions added by extending classes.
                  if ( ( $matches[0][0] === "\n" ) &&
                       ( $definition['class'] !== 'ezcDocumentBBCodeNewLineToken' ) )
                  {
                      $tokens[] = new ezcDocumentBBCodeNewLineToken( $matches[0][0], $line, $position );
                      ++$line;
                      $position = 0;
                  }

                  // A token matched, so create the token from the configured
                  // class and update all variables.
                  $class    = $definition['class'];
                  $newToken = new $class(
                      ( isset( $matches['value'] ) ? $matches['value'] : null ),
                      $line,
                      $position
                  );

                  // Remove the matched contents from the input string. If the
                  // expression contains a group named "match", only this part
                  // is consumed, otherwise the full match.
                  $consumed = isset( $matches['match'] ) ? $matches['match'] : $matches[0];
                  $string   = (string) substr( $string, strlen( $consumed ) );

                  if ( $newToken instanceof ezcDocumentBBCodeNewLineToken )
                  {
                      // On a newline token reset the line position and
                      // increase the line value.
                      ++$line;
                      $position = 0;
                  }
                  elseif ( ( $newLines = substr_count( $consumed, "\n" ) ) > 0 )
                  {
                      // Otherwise still update the line value, when there is
                      // at minimum one newline in the match. This may lead to
                      // a false position value.
                      $line    += $newLines;
                      $position = 0;
                  }

                  // Convert tabs to spaces for whitespace tokens
                  if ( $newToken instanceof ezcDocumentBBCodeWhitespaceToken )
                  {
                      $this->convertTabs( $newToken );
                  }

                  // If we found an explicit EOF token, just exit the parsing
                  // process. The token itself is not added, since an end of
                  // file token is appended below in any case.
                  if ( $newToken instanceof ezcDocumentBBCodeEndOfFileToken )
                  {
                      break 2;
                  }

                  // Add token to extracted token list
                  $tokens[] = $newToken;

                  // Update position, not before converting tabs to spaces.
                  $position += ( $newToken instanceof ezcDocumentBBCodeNewLineToken ) ? 1 : strlen( $newToken->content );

                  // Restart the while loop, because we matched a token and
                  // can retry with the shortened string.
                  continue 2;
              }

              // None of the token definitions matched the input string. We
              // throw an exception with the position of the content in the
              // input string and the contents we could not match.
              //
              // This should never be thrown, but it is hard to prove that
              // there is nothing which is not matched by the regular
              // expressions above.
              throw new ezcDocumentBBCodeTokenizerException(
                  $line,
                  $position,
                  $string
              );
          }

          // Finally append two newline tokens and an end of file token, to
          // make parsing the end easier.
          $tokens[] = new ezcDocumentBBCodeNewLineToken( "\n", $line, $position );
          $tokens[] = new ezcDocumentBBCodeNewLineToken( "\n", $line, $position );
          $tokens[] = new ezcDocumentBBCodeEndOfFileToken( null, $line, $position );

          return $tokens;
      }
  }
  265. ?>