PageRenderTime 77ms CodeModel.GetById 49ms RepoModel.GetById 0ms app.codeStats 0ms

/Document/src/document/wiki/tokenizer.php

https://github.com/oluwalataz/zetacomponents
PHP | 277 lines | 91 code | 22 blank | 164 comment | 10 complexity | 2b83c5ff0610bcb1f367de5f8aef224d MD5 | raw file
  1. <?php
  2. /**
  3. * File containing the ezcDocumentWikiTokenizer
  4. *
  5. * Licensed to the Apache Software Foundation (ASF) under one
  6. * or more contributor license agreements. See the NOTICE file
  7. * distributed with this work for additional information
  8. * regarding copyright ownership. The ASF licenses this file
  9. * to you under the Apache License, Version 2.0 (the
  10. * "License"); you may not use this file except in compliance
  11. * with the License. You may obtain a copy of the License at
  12. *
  13. * http://www.apache.org/licenses/LICENSE-2.0
  14. *
  15. * Unless required by applicable law or agreed to in writing,
  16. * software distributed under the License is distributed on an
  17. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  18. * KIND, either express or implied. See the License for the
  19. * specific language governing permissions and limitations
  20. * under the License.
  21. *
  22. * @package Document
  23. * @version //autogen//
  24. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
  25. */
  26. /**
  27. * Tokenizer for wiki documents
  28. *
  29. * The tokenizer used for all wiki documents should prepare a token array,
  30. * which can be used by the wiki parser, without any wiki language specific
  31. * handling in the parser itself required. For this the tokenizing is performed
  32. * in two steps:
  33. *
  34. * 1) Extract tokens from text
  35. * 2) Filter tokens
  36. *
  37. * Token extraction
  38. * ----------------
  39. *
  40. * For the token extraction the reqular expressions in the $tokens property are
  41. * used. The $tokens array has to be build like, and can be created in the
  42. * constrctor:
  43. *
  44. * <code>
  45. * array(
  46. * array(
  47. * 'class' => Class name of token,
  48. * 'match' => Regular expression to match,
  49. * ),
  50. * ...
  51. * )
  52. * </code>
  53. *
  54. * The array is evaluated in the given order, until one of the regular
  55. * expressions match. The regular expression should have at least one named
  56. * match (?P<value> ... ), with the name "value", which will be assigned to the
  57. * token, created form the given class name, as its content. The matched
  58. * contents will be removed from the beginning of the string.
  59. * Optionally a second named match, called "match", may be used inside the
  60. * regular expression. If so, only the contents inside this match will be
  61. * removed from the beginning of the string. This enables you to perform a
  62. * trivial lookahead inside the tokenizer.
  63. *
  64. * If no expression matches, an exception will be thrown.
  65. *
  66. * Token filtering
  67. * ---------------
  68. *
  69. * After all tokens are extracted from the text, they may miss some values,
  70. * which may be required by the parser, like the level of title tokens. Those
  71. * should be extracted and assigned during the filtering stage. For this the
  72. * filterTokens() method should be implemented, which may iterate over the
  73. * token stream and assign the required values.
  74. *
  75. * If the wiki markup language supports plugins you may also want to "parse"
  76. * the plugin contents to extract type, parameters and its text here.
  77. *
  78. * @package Document
  79. * @version //autogen//
  80. */
  81. abstract class ezcDocumentWikiTokenizer
  82. {
  83. /**
  84. * List with tokens and a regular expression matching the given token.
  85. *
  86. * The tokens are matched in the given order.
  87. *
  88. * @var array
  89. */
  90. protected $tokens = array();
  91. /**
  92. * Construct tokenizer
  93. *
  94. * Create token array with regular repression matching the respective
  95. * token.
  96. *
  97. * @return void
  98. */
  99. abstract public function __construct();
  100. /**
  101. * Tokenize the given file
  102. *
  103. * The method tries to tokenize the passed files and returns an array of
  104. * ezcDocumentWikiToken struct on succes, or throws a
  105. * ezcDocumentTokenizerException, if something could not be matched by any
  106. * token.
  107. *
  108. * @param string $file
  109. * @return array
  110. */
  111. public function tokenizeFile( $file )
  112. {
  113. if ( !file_exists( $file ) || !is_readable( $file ) )
  114. {
  115. throw new ezcBaseFileNotFoundException( $file );
  116. }
  117. return $this->tokenizeString( file_get_contents( $file ) );
  118. }
  119. /**
  120. * Convert tabs to spaces
  121. *
  122. * Convert all tabs to spaces, as defined in:
  123. * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#whitespace
  124. *
  125. * @param ezcDocumentWikiToken $token
  126. * @return void
  127. */
  128. protected function convertTabs( ezcDocumentWikiToken $token )
  129. {
  130. while ( ( $position = strpos( $token->content, "\t" ) ) !== false )
  131. {
  132. $token->content =
  133. substr( $token->content, 0, $position ) .
  134. str_repeat( ' ', 9 - ( ( $position + $token->position ) % 8 ) ) .
  135. substr( $token->content, $position + 1 );
  136. }
  137. }
  138. /**
  139. * Filter tokens
  140. *
  141. * Method to filter tokens, after the input string ahs been tokenized. The
  142. * filter should extract additional information from tokens, which are not
  143. * generally available yet, like the depth of a title depending on the
  144. * title markup.
  145. *
  146. * @param array $tokens
  147. * @return array
  148. */
  149. abstract protected function filterTokens( array $tokens );
  150. /**
  151. * Tokenize the given string
  152. *
  153. * The method tries to tokenize the passed strings and returns an array of
  154. * ezcDocumentWikiToken struct on succes, or throws a
  155. * ezcDocumentTokenizerException, if something could not be matched by any
  156. * token.
  157. *
  158. * @param string $string
  159. * @return array
  160. */
  161. public function tokenizeString( $string )
  162. {
  163. $line = 0;
  164. $position = 1;
  165. $tokens = array();
  166. $string = "\n" . $string;
  167. // Normalize newlines
  168. $string = preg_replace( '([\x20\\t]*(?:\\r\\n|\\r|\\n))', "\n", $string );
  169. while ( strlen( $string ) > 0 )
  170. {
  171. foreach ( $this->tokens as $match )
  172. {
  173. if ( preg_match( $match['match'], $string, $matches ) )
  174. {
  175. // If the first part of the match is a
  176. // newline, add a respective token to the
  177. // stack.
  178. if ( ( $matches[0][0] === "\n" ) &&
  179. ( $match['class'] !== 'ezcDocumentWikiNewLineToken' ) )
  180. {
  181. $tokens[] = new ezcDocumentWikiNewLineToken( $matches[0][0], $line, $position );
  182. ++$line;
  183. $position = 0;
  184. }
  185. // A token matched, so add the matched token to the token
  186. // list and update all variables.
  187. $class = $match['class'];
  188. $newToken = new $class(
  189. ( isset( $matches['value'] ) ? $matches['value'] : null ),
  190. $line,
  191. $position
  192. );
  193. $match = isset( $matches['match'] ) ? $matches['match'] : $matches[0];
  194. // Removed matched stuff from input string
  195. $string = substr( $string, $length = strlen( $match ) );
  196. // On a newline token reset the line position and increase the line value
  197. if ( $newToken instanceof ezcDocumentWikiNewLineToken )
  198. {
  199. ++$line;
  200. $position = 0;
  201. }
  202. else
  203. {
  204. // Otherwise still update the line
  205. // value, when there is at minimum
  206. // one newline in the match. This may
  207. // lead to a false position value.
  208. if ( ( $newLines = substr_count( $match, "\n" ) ) > 0 )
  209. {
  210. $line += $newLines;
  211. $position = 0;
  212. }
  213. }
  214. // Convert tabs to spaces for whitespace tokens
  215. if ( $newToken instanceof ezcDocumentWikiWhitespaceToken )
  216. {
  217. $this->convertTabs( $newToken );
  218. }
  219. // If we found an explicit EOF token, just exit the parsing process.
  220. if ( $newToken instanceof ezcDocumentWikiEndOfFileToken )
  221. {
  222. break 2;
  223. }
  224. // Add token to extracted token list
  225. $tokens[] = $newToken;
  226. // Update position, not before converting tabs to spaces.
  227. $position += ( $newToken instanceof ezcDocumentWikiNewLineToken ) ? 1 : strlen( $newToken->content );
  228. // Restart the while loop, because we matched a token and
  229. // can retry with shortened string.
  230. continue 2;
  231. }
  232. }
  233. // None of the token definitions matched the input string. We throw
  234. // an exception with the position of the content in the input
  235. // string and the contents we could not match.
  236. //
  237. // This should never been thrown, but it is hard to prove that
  238. // there is nothing which is not matched by the regualr expressions
  239. // above.
  240. throw new ezcDocumentWikiTokenizerException(
  241. $line,
  242. $position,
  243. $string
  244. );
  245. }
  246. // Finally append ainother newline token and a end of file token, to
  247. // make parsing the end easier.
  248. $tokens[] = new ezcDocumentWikiNewLineToken( "\n", $line, $position );
  249. $tokens[] = new ezcDocumentWikiNewLineToken( "\n", $line, $position );
  250. $tokens[] = new ezcDocumentWikiEndOfFileToken( null, $line, $position );
  251. return $this->filterTokens( $tokens );
  252. }
  253. }
  254. ?>