PageRenderTime 24ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/atlas/engine/ezc/Document/src/document/wiki/tokenizer/dokuwiki.php

https://github.com/jacomyma/GEXF-Atlas
PHP | 286 lines | 183 code | 19 blank | 84 comment | 6 complexity | 300d80125270fbae4905ea0831c83d81 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. <?php
  2. /**
  3. * File containing the ezcDocumentWikiDokuwikiTokenizer
  4. *
  5. * @package Document
  6. * @version 1.1.2
  7. * @copyright Copyright (C) 2005-2009 eZ Systems AS. All rights reserved.
  8. * @license http://ez.no/licenses/new_bsd New BSD License
  9. */
  10. /**
  11. * Tokenizer for Dokuwiki wiki documents.
  12. *
  13. * The Dokuwiki wiki is a very popular wiki, which for example is currently
  14. * used at http://wiki.php.net. The Dokuwiki syntax definition can be found at:
  15. *
  16. * http://www.dokuwiki.org/syntax
  17. *
  18. * For the basic workings of the tokenizer see the class level documentation in
  19. * the ezcDocumentWikiTokenizer class.
  20. *
  21. * @package Document
  22. * @version 1.1.2
  23. */
  24. class ezcDocumentWikiDokuwikiTokenizer extends ezcDocumentWikiTokenizer
  25. {
  26. /**
  27. * Common whitespace characters. The vertical tab is excluded, because it
  28. * causes strange problems with PCRE.
  29. */
  30. const WHITESPACE_CHARS = '[\\x20\\t]';
  31. /**
  32. * Characters ending a pure text section.
  33. */
  34. const TEXT_END_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=\\r\\n\\t\\x20';
  35. /**
  36. * Special characters, which do have some special meaaning and though may
  37. * not have been matched otherwise.
  38. */
  39. const SPECIAL_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=';
  40. /**
  41. * Construct tokenizer
  42. *
  43. * Create token array with regular repression matching the respective
  44. * token.
  45. *
  46. * @return void
  47. */
  48. public function __construct()
  49. {
  50. $this->tokens = array(
  51. // Match tokens which require to be at the start of a line before
  52. // matching the actual newlines, because they are the indicator for
  53. // line starts.
  54. array(
  55. 'class' => 'ezcDocumentWikiTitleToken',
  56. 'match' => '(\\A(?P<match>(?:\\n|' . self::WHITESPACE_CHARS . '+)(?P<value>={2,6}))(?:\\n|' . self::WHITESPACE_CHARS . '+))S' ),
  57. array(
  58. 'class' => 'ezcDocumentWikiBulletListItemToken',
  59. 'match' => '(\\A\\n(?P<value>\\x20*\\*)' . self::WHITESPACE_CHARS . '+)S' ),
  60. array(
  61. 'class' => 'ezcDocumentWikiEnumeratedListItemToken',
  62. 'match' => '(\\A\\n(?P<value>\\x20*-)' . self::WHITESPACE_CHARS . '+)S' ),
  63. array(
  64. 'class' => 'ezcDocumentWikiLiteralBlockToken',
  65. 'match' => '(\\A(?P<match>\\n<(code|file)>\\n(?P<value>.+)\\n</\\2>)\\n)SUsi' ),
  66. array(
  67. 'class' => 'ezcDocumentWikiLiteralBlockToken',
  68. 'match' => '(\\A(?P<match>\\n(?P<value>(' . self::WHITESPACE_CHARS . '+).*\n(?:\\3.*\n)*)))S' ),
  69. array(
  70. 'class' => 'ezcDocumentWikiTextLineToken',
  71. 'match' => '(\\A(?P<match>\\n<nowiki>\\n(?P<value>.+)\\n</nowiki>)\\n)SUsi' ),
  72. array(
  73. 'class' => 'ezcDocumentWikiTableRowToken',
  74. 'match' => '(\\A(?P<match>\\n)(?P<value>[|^]))S' ),
  75. array(
  76. 'class' => 'ezcDocumentWikiParagraphIndentationToken',
  77. 'match' => '(\\A\\n(?P<value>>+)' . self::WHITESPACE_CHARS . '*)S' ),
  78. // Whitespaces
  79. array(
  80. 'class' => 'ezcDocumentWikiNewLineToken',
  81. 'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
  82. array(
  83. 'class' => 'ezcDocumentWikiWhitespaceToken',
  84. 'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
  85. array(
  86. 'class' => 'ezcDocumentWikiEndOfFileToken',
  87. 'match' => '(\\A(?P<value>\\x0c))S' ),
  88. // Escape character
  89. /*
  90. array(
  91. 'class' => 'ezcDocumentWikiEscapeCharacterToken',
  92. 'match' => '(\\A(?P<value>~))S' ),
  93. // */
  94. // Inline markup
  95. array(
  96. 'class' => 'ezcDocumentWikiBoldToken',
  97. 'match' => '(\\A(?P<value>\\*\\*))S' ),
  98. array(
  99. 'class' => 'ezcDocumentWikiItalicToken',
  100. 'match' => '(\\A(?P<value>//))S' ),
  101. array(
  102. 'class' => 'ezcDocumentWikiMonospaceToken',
  103. 'match' => '(\\A(?P<value>\'\'))S' ),
  104. array(
  105. 'class' => 'ezcDocumentWikiSuperscriptToken',
  106. 'match' => '(\\A(?P<value></?sup>))Si' ),
  107. array(
  108. 'class' => 'ezcDocumentWikiSubscriptToken',
  109. 'match' => '(\\A(?P<value></?sub>))Si' ),
  110. array(
  111. 'class' => 'ezcDocumentWikiUnderlineToken',
  112. 'match' => '(\\A(?P<value>__))S' ),
  113. array(
  114. 'class' => 'ezcDocumentWikiDeletedToken',
  115. 'match' => '(\\A(?P<value></?del>))Si' ),
  116. array(
  117. 'class' => 'ezcDocumentWikiInlineLiteralToken',
  118. 'match' => '(\\A<nowiki>(?P<value>.*)</nowiki>)SUi' ),
  119. array(
  120. 'class' => 'ezcDocumentWikiTextLineToken',
  121. 'match' => '(\\A%%(?P<value>.*)%%)SUi' ),
  122. array(
  123. 'class' => 'ezcDocumentWikiLineBreakToken',
  124. 'match' => '(\\A(?P<match>(?P<value>\\\\\\\\))(?:' . self::WHITESPACE_CHARS . '|\\n))S' ),
  125. array(
  126. 'class' => 'ezcDocumentWikiLinkStartToken',
  127. 'match' => '(\\A(?P<value>\\[\\[))S' ),
  128. array(
  129. 'class' => 'ezcDocumentWikiLinkEndToken',
  130. 'match' => '(\\A(?P<value>\\]\\]))S' ),
  131. array(
  132. 'class' => 'ezcDocumentWikiSeparatorToken',
  133. 'match' => '(\\A(?P<value>\\||' . self::WHITESPACE_CHARS . '*->' . self::WHITESPACE_CHARS . '*))S' ),
  134. array(
  135. 'class' => 'ezcDocumentWikiExternalLinkToken',
  136. 'match' => '(\\A
  137. (?P<match>
  138. (?P<value>
  139. # Match common URLs
  140. [a-z]+://\S+? |
  141. # Match mail addresses enclosed by <>
  142. <[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?>
  143. )
  144. # Greedy match on text end chars, which should NOT be included in URLs
  145. )[,.?!:;"\']?(?:' . self::WHITESPACE_CHARS . '|\\n|\\||]]|\\}\\}|$)
  146. )Sx' ),
  147. array(
  148. 'class' => 'ezcDocumentWikiInterWikiLinkToken',
  149. 'match' => '(\\A(?P<value>([A-Za-z]+)>[^\\]|]+))S' ),
  150. array(
  151. 'class' => 'ezcDocumentWikiImageStartToken',
  152. 'match' => '(\\A(?P<value>\\{\\{))S' ),
  153. array(
  154. 'class' => 'ezcDocumentWikiImageEndToken',
  155. 'match' => '(\\A(?P<value>\\}\\}))S' ),
  156. array(
  157. 'class' => 'ezcDocumentWikiFootnoteStartToken',
  158. 'match' => '(\\A(?P<value>\\(\\())S' ),
  159. array(
  160. 'class' => 'ezcDocumentWikiFootnoteEndToken',
  161. 'match' => '(\\A(?P<value>\\)\\)))S' ),
  162. array(
  163. 'class' => 'ezcDocumentWikiTableHeaderToken',
  164. 'match' => '(\\A(?P<value>\\^))S' ),
  165. array(
  166. 'class' => 'ezcDocumentWikiPluginToken',
  167. 'match' => '(\\A(?P<value><([a-zA-Z]+).*?</\\2>))Ss' ),
  168. // Match text except
  169. array(
  170. 'class' => 'ezcDocumentWikiTextLineToken',
  171. 'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),
  172. // Match all special characters, which are not valid textual chars,
  173. // but do not have been matched by any other expression.
  174. array(
  175. 'class' => 'ezcDocumentWikiSpecialCharsToken',
  176. 'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
  177. );
  178. }
  179. /**
  180. * Parse plugin contents
  181. *
  182. * Plugins are totally different in each wiki component and its contents
  183. * should not be passed through the normal wiki parser. So we fetch its
  184. * contents completely and let each tokinzer extract names and parameters
  185. * from the complete token itself.
  186. *
  187. * @param ezcDocumentWikiPluginToken $plugin
  188. * @return void
  189. */
  190. protected function parsePluginContents( ezcDocumentWikiPluginToken $plugin )
  191. {
  192. // Match name of plugin
  193. if ( preg_match( '(^\\s*<(?P<type>[a-zA-Z]+)(?:\\s+(?P<params>[^>]+))?>(?P<content>.*?)\\s*</\\1>\\s*)si', $plugin->content, $match ) )
  194. {
  195. $plugin->type = strtolower( $match['type'] );
  196. $plugin->parameters = isset( $match['params'] ) && $match['params'] ? array( $match['params'] ) : array();
  197. $plugin->text = $match['content'];
  198. }
  199. }
  200. /**
  201. * Filter tokens
  202. *
  203. * Method to filter tokens, after the input string ahs been tokenized. The
  204. * filter should extract additional information from tokens, which are not
  205. * generally available yet, like the depth of a title depending on the
  206. * title markup.
  207. *
  208. * @param array $tokens
  209. * @return array
  210. */
  211. protected function filterTokens( array $tokens )
  212. {
  213. $lastImageStartToken = null;
  214. foreach ( $tokens as $nr => $token )
  215. {
  216. switch ( true )
  217. {
  218. // Extract the title / indentation level from the tokens
  219. // length.
  220. case $token instanceof ezcDocumentWikiTitleToken:
  221. $token->level = 7 - strlen( trim( $token->content ) );
  222. break;
  223. case $token instanceof ezcDocumentWikiParagraphIndentationToken:
  224. $token->level = strlen( trim( $token->content ) );
  225. break;
  226. case $token instanceof ezcDocumentWikiImageStartToken:
  227. // Check if an alignement has been specified by whitespace
  228. // tokens.
  229. $lastImageStartToken = $token;
  230. if ( $tokens[$next = $nr + 1] instanceof ezcDocumentWikiWhitespaceToken )
  231. {
  232. $token->alignement = 'right';
  233. unset( $tokens[$nr + 1] );
  234. ++$next;
  235. }
  236. if ( preg_match( '(\\?(?P<width>\d+)(?:x(?P<height>\d+))?$)', $tokens[$next]->content, $match ) )
  237. {
  238. $tokens[$next]->content = substr( $tokens[$next]->content, 0, -strlen( $match[0] ) );
  239. $token->width = isset( $match['width'] ) ? (int) $match['width'] : null;
  240. $token->height = isset( $match['height'] ) ? (int) $match['height'] : null;
  241. }
  242. break;
  243. case $token instanceof ezcDocumentWikiImageEndToken:
  244. case $token instanceof ezcDocumentWikiSeparatorToken:
  245. // Check if an alignement has been specified by whitespace
  246. // tokens.
  247. if ( ( $tokens[$nr - 1] instanceof ezcDocumentWikiWhitespaceToken ) &&
  248. ( $lastImageStartToken !== null ) )
  249. {
  250. $lastImageStartToken->alignement = $lastImageStartToken->alignement === 'right' ? 'center' : 'left';
  251. unset( $tokens[$nr - 1] );
  252. }
  253. $lastImageStartToken = null;
  254. break;
  255. case $token instanceof ezcDocumentWikiBulletListItemToken:
  256. case $token instanceof ezcDocumentWikiEnumeratedListItemToken:
  257. $token->indentation = substr_count( $token->content, ' ' );
  258. break;
  259. case $token instanceof ezcDocumentWikiPluginToken:
  260. $this->parsePluginContents( $token );
  261. break;
  262. }
  263. }
  264. return $tokens;
  265. }
  266. }
  267. ?>