/protected/components/ezcomponents/Document/src/document/wiki/tokenizer.php

https://github.com/kamarulismail/kamarul-playground · PHP · 261 lines · 91 code · 22 blank · 148 comment · 10 complexity · 53215b79bc6ad3ada45bd7223e5311bc MD5 · raw file

  1. <?php
  2. /**
  3. * File containing the ezcDocumentWikiTokenizer
  4. *
  5. * @package Document
  6. * @version 1.3.1
  7. * @copyright Copyright (C) 2005-2010 eZ Systems AS. All rights reserved.
  8. * @license http://ez.no/licenses/new_bsd New BSD License
  9. */
  10. /**
  11. * Tokenizer for wiki documents
  12. *
  13. * The tokenizer used for all wiki documents should prepare a token array,
  14. * which can be used by the wiki parser, without any wiki language specific
  15. * handling in the parser itself required. For this the tokenizing is performed
  16. * in two steps:
  17. *
  18. * 1) Extract tokens from text
  19. * 2) Filter tokens
  20. *
  21. * Token extraction
  22. * ----------------
  23. *
  24. * For the token extraction the regular expressions in the $tokens property are
  25. * used. The $tokens array has to be built as follows, and can be created in
  26. * the constructor:
  27. *
  28. * <code>
  29. * array(
  30. * array(
  31. * 'class' => Class name of token,
  32. * 'match' => Regular expression to match,
  33. * ),
  34. * ...
  35. * )
  36. * </code>
  37. *
  38. * The array is evaluated in the given order, until one of the regular
  39. * expressions match. The regular expression should have at least one named
  40. * match (?P<value> ... ), with the name "value", which will be assigned to the
  41. * token, created from the given class name, as its content. The matched
  42. * contents will be removed from the beginning of the string.
  43. * Optionally a second named match, called "match", may be used inside the
  44. * regular expression. If so, only the contents inside this match will be
  45. * removed from the beginning of the string. This enables you to perform a
  46. * trivial lookahead inside the tokenizer.
  47. *
  48. * If no expression matches, an exception will be thrown.
  49. *
  50. * Token filtering
  51. * ---------------
  52. *
  53. * After all tokens are extracted from the text, they may miss some values,
  54. * which may be required by the parser, like the level of title tokens. Those
  55. * should be extracted and assigned during the filtering stage. For this the
  56. * filterTokens() method should be implemented, which may iterate over the
  57. * token stream and assign the required values.
  58. *
  59. * If the wiki markup language supports plugins you may also want to "parse"
  60. * the plugin contents to extract type, parameters and its text here.
  61. *
  62. * @package Document
  63. * @version 1.3.1
  64. */
  abstract class ezcDocumentWikiTokenizer
  {
      /**
       * List with tokens and a regular expression matching the given token.
       *
       * Each entry is an array with the keys 'class' (name of the token class
       * to instantiate) and 'match' (regular expression applied to the start
       * of the remaining input). The entries are tried in the given order.
       *
       * @var array
       */
      protected $tokens = array();

      /**
       * Construct tokenizer
       *
       * Create the token array with the regular expressions matching the
       * respective tokens.
       *
       * @return void
       */
      abstract public function __construct();

      /**
       * Tokenize the given file
       *
       * Reads the passed file and hands its contents to tokenizeString().
       *
       * @param string $file
       * @return array
       *
       * @throws ezcBaseFileNotFoundException
       *         If the file does not exist, is not readable, or could not be
       *         read.
       */
      public function tokenizeFile( $file )
      {
          if ( !file_exists( $file ) || !is_readable( $file ) )
          {
              throw new ezcBaseFileNotFoundException( $file );
          }

          // file_get_contents() may still fail (for example on a directory or
          // when the file vanishes in between). Do not silently tokenize the
          // boolean false as if it was an empty document.
          $contents = file_get_contents( $file );
          if ( $contents === false )
          {
              throw new ezcBaseFileNotFoundException( $file );
          }

          return $this->tokenizeString( $contents );
      }

      /**
       * Convert tabs to spaces
       *
       * Convert all tabs to spaces, as defined in:
       * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#whitespace
       *
       * Every tab is replaced by as many spaces as are required to reach the
       * next tab stop, with tab stops at every 8th column. The token position
       * is treated as one based, which is what tokenizeString() assigns to
       * tokens following a new line token.
       *
       * @param ezcDocumentWikiToken $token
       * @return void
       */
      protected function convertTabs( ezcDocumentWikiToken $token )
      {
          while ( ( $position = strpos( $token->content, "\t" ) ) !== false )
          {
              // Zero based column of the tab inside its line. Clamped, because
              // a token may carry the position 0 right after a multi line
              // match.
              $column = max( 0, $position + $token->position - 1 );

              // A tab always expands to 1..8 spaces. Calculating the width
              // from the zero based column avoids the modulo wrapping around
              // when the tab is located directly in front of a tab stop.
              $token->content =
                  substr( $token->content, 0, $position ) .
                  str_repeat( ' ', 8 - ( $column % 8 ) ) .
                  substr( $token->content, $position + 1 );
          }
      }

      /**
       * Filter tokens
       *
       * Method to filter tokens, after the input string has been tokenized.
       * The filter should extract additional information from tokens, which
       * are not generally available yet, like the depth of a title depending
       * on the title markup.
       *
       * @param array $tokens
       * @return array
       */
      abstract protected function filterTokens( array $tokens );

      /**
       * Tokenize the given string
       *
       * The method tries to tokenize the passed string and returns the array
       * of tokens, as returned by filterTokens(), on success.
       *
       * A new line is prepended to the input and two new line tokens plus an
       * end of file token are appended to the token list, so that the start
       * and the end of the document do not need special handling.
       *
       * @param string $string
       * @return array
       *
       * @throws ezcDocumentWikiTokenizerException
       *         If the remaining input could not be matched by any token
       *         definition.
       */
      public function tokenizeString( $string )
      {
          $line     = 0;
          $position = 1;
          $tokens   = array();
          $string   = "\n" . $string;

          // Normalize newlines and strip trailing whitespace in front of them
          $string = preg_replace( '([\x20\\t]*(?:\\r\\n|\\r|\\n))', "\n", $string );

          while ( strlen( $string ) > 0 )
          {
              foreach ( $this->tokens as $definition )
              {
                  if ( !preg_match( $definition['match'], $string, $matches ) )
                  {
                      continue;
                  }

                  $class = $definition['class'];

                  // Only the contents of the optional named group "match"
                  // are removed from the input, otherwise the full match.
                  $consumed = isset( $matches['match'] ) ? $matches['match'] : $matches[0];

                  // A definition which does not consume anything would never
                  // shorten the input and keep this loop running forever.
                  // Skip it, so that another definition may match, or the
                  // tokenizer exception below is thrown. An end of file token
                  // is fine, since it terminates the tokenizing anyways.
                  $isEndOfFile = ( $class === 'ezcDocumentWikiEndOfFileToken' ) ||
                      is_subclass_of( $class, 'ezcDocumentWikiEndOfFileToken' );
                  if ( ( $consumed === '' ) && !$isEndOfFile )
                  {
                      continue;
                  }

                  // If the first part of the match is a newline, add a
                  // respective token to the stack.
                  if ( ( substr( $matches[0], 0, 1 ) === "\n" ) &&
                       ( $class !== 'ezcDocumentWikiNewLineToken' ) )
                  {
                      $tokens[] = new ezcDocumentWikiNewLineToken( "\n", $line, $position );
                      ++$line;
                      $position = 0;
                  }

                  // A token matched, so create the token and update all
                  // variables.
                  $newToken = new $class(
                      ( isset( $matches['value'] ) ? $matches['value'] : null ),
                      $line,
                      $position
                  );

                  // Remove matched stuff from input string
                  $string = substr( $string, strlen( $consumed ) );

                  if ( $newToken instanceof ezcDocumentWikiNewLineToken )
                  {
                      // On a newline token reset the line position and
                      // increase the line value
                      ++$line;
                      $position = 0;
                  }
                  elseif ( ( $newLines = substr_count( $consumed, "\n" ) ) > 0 )
                  {
                      // Otherwise still update the line value, when there is
                      // at minimum one newline in the match. This may lead to
                      // a false position value.
                      $line    += $newLines;
                      $position = 0;
                  }

                  // Convert tabs to spaces for whitespace tokens
                  if ( $newToken instanceof ezcDocumentWikiWhitespaceToken )
                  {
                      $this->convertTabs( $newToken );
                  }

                  // If we found an explicit EOF token, just exit the parsing
                  // process.
                  if ( $newToken instanceof ezcDocumentWikiEndOfFileToken )
                  {
                      break 2;
                  }

                  // Add token to extracted token list
                  $tokens[] = $newToken;

                  // Update position, not before converting tabs to spaces.
                  $position += ( $newToken instanceof ezcDocumentWikiNewLineToken ) ? 1 : strlen( $newToken->content );

                  // Restart the while loop, because we matched a token and
                  // can retry with the shortened string.
                  continue 2;
              }

              // None of the token definitions matched the input string. We
              // throw an exception with the position of the content in the
              // input string and the contents we could not match.
              //
              // This should never be thrown, but it is hard to prove that
              // there is nothing which is not matched by the regular
              // expressions above.
              throw new ezcDocumentWikiTokenizerException(
                  $line,
                  $position,
                  $string
              );
          }

          // Finally append another two newline tokens and an end of file
          // token, to make parsing the end easier.
          $tokens[] = new ezcDocumentWikiNewLineToken( "\n", $line, $position );
          $tokens[] = new ezcDocumentWikiNewLineToken( "\n", $line, $position );
          $tokens[] = new ezcDocumentWikiEndOfFileToken( null, $line, $position );

          return $this->filterTokens( $tokens );
      }
  }
  238. ?>