PageRenderTime 57ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/htmlizer.php

https://github.com/emohamed/htmlizer
PHP | 330 lines | 294 code | 33 blank | 3 comment | 11 complexity | 420ce7d67c2a65179e9c351e932054c5 MD5 | raw file
  1. <?php
  2. class Htmlizer {
  3. public $plain_text, $result_html;
  4. protected $filters = array(
  5. 'html_special_chars',
  6. array('code_blocks', 'before_filter'),
  7. array('inline_code', 'before_filter'),
  8. 'header',
  9. 'bold',
  10. 'super_script',
  11. 'sub_script',
  12. 'link',
  13. 'list',
  14. 'auto_p',
  15. 'code_blocks',
  16. 'inline_code',
  17. );
  18. function htmlize($plain_text) {
  19. $return = $plain_text;
  20. foreach ($this->filters as $filter) {
  21. $callback = Htmlizer_Filter::factory($filter);
  22. if (!is_callable($callback)) {
  23. throw new Exception("Invalid callback");
  24. }
  25. $return = call_user_func($callback, $return);
  26. }
  27. return $return;
  28. }
  29. }
  30. abstract class Htmlizer_Filter {
  31. static $classes = array();
  32. protected $start_token,
  33. $end_token,
  34. $replaced_start,
  35. $replaced_end;
  36. # particular elements like headings should be the only text on the line
  37. protected $whole_line_only = false;
  38. protected $block_element = false;
  39. static function factory($filter) {
  40. $method = 'process';
  41. if (is_array($filter)) {
  42. if(count($filter)==2) {
  43. list($filter_class, $method) = array_values($filter);
  44. $filter = $filter_class;
  45. } else {
  46. $error = "Cannot parse filter " . print_r($filter, 1);
  47. throw new Htmlizer_Filter_Exception($error);
  48. }
  49. }
  50. if (isset(self::$classes[$filter])) {
  51. return array(self::$classes[$filter], $method);
  52. }
  53. $filter_class = str_replace(' ', '', ucwords(str_replace('_', ' ', $filter)));
  54. $filter_class = 'Htmlizer_Filter_' . $filter_class;
  55. if (!class_exists($filter_class)) {
  56. $error = "Unknow filter $filter";
  57. throw new Htmlizer_Filter_Exception($error);
  58. }
  59. $filter_object = new $filter_class();
  60. self::$classes[$filter] = $filter_object;
  61. return array($filter_object, $method);
  62. }
  63. function build_regex() {
  64. $start_token = preg_quote($this->start_token);
  65. $end_token = preg_quote($this->end_token);
  66. $is_greedy = true;
  67. if ($this->whole_line_only) {
  68. $is_greedy = false;
  69. }
  70. $flags = array();
  71. if (empty($start_token)) {
  72. $error = "Cannot determinate token for " . get_class($this) . " inline filter. ";
  73. throw new Htmlizer_Filter_Exception($error);
  74. }
  75. $regex_core = $start_token . '(.+' . ($is_greedy ? '?' : '') . ')' . $end_token;
  76. $regex_separator = '~';
  77. if ($this->whole_line_only) {
  78. # allow whitespace
  79. $regex_core = "^\s*$regex_core\s*$";
  80. $flags[] = "m"; # multi line so start of line and end of line works correctly
  81. } elseif ($this->block_element) {
  82. $flags[] = "s"; # dot all
  83. }
  84. # escape the regex separator in the regex core -- that way "~" token won't fail
  85. $this->regex = $regex_separator .
  86. str_replace($regex_separator, '\\' . $regex_separator, $regex_core) .
  87. $regex_separator .
  88. implode('', $flags);
  89. }
  90. function process($plain_text) {
  91. $this->build_regex();
  92. $replaced = preg_replace(
  93. $this->regex,
  94. $this->replaced_start . '$1' . $this->replaced_end,
  95. $plain_text
  96. );
  97. return $replaced;
  98. }
  99. }
  100. class Htmlizer_Filter_Exception extends Exception {}
  101. class Htmlizer_Filter_HtmlSpecialChars extends Htmlizer_Filter {
  102. function process($filter) {
  103. return htmlspecialchars($filter, ENT_NOQUOTES);
  104. }
  105. }
  106. class Htmlizer_Filter_AutoP extends Htmlizer_Filter {
  107. /**
  108. * Accepts matches array from preg_replace_callback in wpautop() or a string.
  109. *
  110. * Ensures that the contents of a <<pre>>...<</pre>> HTML block are not
  111. * converted into paragraphs or line-breaks.
  112. *
  113. * @param array|string $matches The array or string
  114. * @return string The pre block without paragraph/line-break conversion.
  115. */
  116. function clean_pre($matches) {
  117. if ( is_array($matches) )
  118. $text = $matches[1] . $matches[2] . "</pre>";
  119. else
  120. $text = $matches;
  121. $text = str_replace('<br />', '', $text);
  122. $text = str_replace('<p>', "\n", $text);
  123. $text = str_replace('</p>', '', $text);
  124. return $text;
  125. }
  126. // Borrowed from WordPress
  127. function process($pee) {
  128. if ( trim($pee) === '' )
  129. return '';
  130. $pee = $pee . "\n"; // just to make things a little easier, pad the end
  131. $pee = preg_replace('|<br />\s*<br />|', "\n\n", $pee);
  132. // Space things out a little
  133. $allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)';
  134. $pee = preg_replace('!(<' . $allblocks . '[^>]*>)!', "\n$1", $pee);
  135. $pee = preg_replace('!(</' . $allblocks . '>)!', "$1\n\n", $pee);
  136. $pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
  137. if ( strpos($pee, '<object') !== false ) {
  138. $pee = preg_replace('|\s*<param([^>]*)>\s*|', "<param$1>", $pee); // no pee inside object/embed
  139. $pee = preg_replace('|\s*</embed>\s*|', '</embed>', $pee);
  140. }
  141. $pee = preg_replace("/\n\n+/", "\n\n", $pee); // take care of duplicates
  142. // make paragraphs, including one at the end
  143. $pees = preg_split('/\n\s*\n/', $pee, -1, PREG_SPLIT_NO_EMPTY);
  144. $pee = '';
  145. foreach ( $pees as $tinkle )
  146. $pee .= '<p>' . trim($tinkle, "\n") . "</p>\n";
  147. $pee = preg_replace('|<p>\s*</p>|', '', $pee); // under certain strange conditions it could create a P of entirely whitespace
  148. $pee = preg_replace('!<p>([^<]+)</(div|address|form)>!', "<p>$1</p></$2>", $pee);
  149. $pee = preg_replace('!<p>\s*(</?' . $allblocks . '[^>]*>)\s*</p>!', "$1", $pee); // don't pee all over a tag
  150. $pee = preg_replace("|<p>(<li.+?)</p>|", "$1", $pee); // problem with nested lists
  151. $pee = preg_replace('|<p><blockquote([^>]*)>|i', "<blockquote$1><p>", $pee);
  152. $pee = str_replace('</blockquote></p>', '</p></blockquote>', $pee);
  153. $pee = preg_replace('!<p>\s*(</?' . $allblocks . '[^>]*>)!', "$1", $pee);
  154. $pee = preg_replace('!(</?' . $allblocks . '[^>]*>)\s*</p>!', "$1", $pee);
  155. $pee = preg_replace('!(</?' . $allblocks . '[^>]*>)\s*<br />!', "$1", $pee);
  156. $pee = preg_replace('!<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)!', '$1', $pee);
  157. if (strpos($pee, '<pre') !== false)
  158. $pee = preg_replace_callback('!(<pre[^>]*>)(.*?)</pre>!is', 'clean_pre', $pee );
  159. $pee = preg_replace( "|\n</p>$|", '</p>', $pee );
  160. $pee = preg_replace( "|\s+$|", '', $pee );
  161. return $pee;
  162. }
  163. }
  164. class Htmlizer_Filter_CodeBlocks extends Htmlizer_Filter {
  165. public $start_token = '{{{', $end_token = '}}}',
  166. $replaced_start = '<div class="code">', $replaced_end = '</div>';
  167. protected $block_element=true;
  168. public $code_blocks_references = array();
  169. function initial_replace_callback($matches) {
  170. $code = $matches[1];
  171. $code_block_id = md5(mt_rand() . $code);
  172. $this->code_blocks_references[$code_block_id] = $code;
  173. return $code_block_id;
  174. }
  175. function before_filter($plain_text) {
  176. $this->build_regex();
  177. return preg_replace_callback($this->regex, array($this, 'initial_replace_callback'), $plain_text);
  178. }
  179. function process($plain_text) {
  180. foreach ($this->code_blocks_references as $code_block_id=>$code) {
  181. $plain_text = str_replace(
  182. $code_block_id,
  183. $this->replaced_start . $code . $this->replaced_end,
  184. $plain_text
  185. );
  186. }
  187. return $plain_text;
  188. }
  189. }
  190. class Htmlizer_Filter_InlineCode extends Htmlizer_Filter_CodeBlocks {
  191. public $start_token = '`', $end_token = '`',
  192. $replaced_start = '<code>', $replaced_end = '</code>';
  193. protected $block_element=true;
  194. }
  195. class Htmlizer_Filter_Inline extends Htmlizer_Filter {
  196. }
  197. class Htmlizer_Filter_List extends Htmlizer_Filter {
  198. function do_list_elements($matches) {
  199. return '<li>' . $matches[1] . '</li>';
  200. }
  201. function do_ol_list_elements($matches) {
  202. return '<oli>' . $matches[1] . '</oli>';
  203. }
  204. function do_unordered_lists($matches) {
  205. return "\n" . '<ul>' . trim($matches[0]) . '</ul>' . "\n";
  206. }
  207. function do_ordered_lists($matches) {
  208. return '<ol>' . trim(str_replace(
  209. array('<oli>', '</oli>'),
  210. array('<li>', '</li>'),
  211. $matches[0]
  212. )) . '</ol>' . "\n";
  213. }
  214. function process($html) {
  215. $html = preg_replace_callback('~\s*^\s*[\-\*•·•] ?(.*?)$~um', array($this, 'do_list_elements'), $html);
  216. $html = preg_replace_callback('~(<li>.*?</li>\s*)+~', array($this, 'do_unordered_lists'), $html);
  217. $html = preg_replace_callback('~^\s*# ?(.*?)$~um', array($this, 'do_ol_list_elements'), $html);
  218. $html = preg_replace_callback('~(<oli>.*?</oli>\s*)+~', array($this, 'do_ordered_lists'), $html);
  219. return $html;
  220. }
  221. }
  222. class Htmlizer_Filter_Link extends Htmlizer_Filter {
  223. protected $link_regex = '~((file:|mailto\:|(news|(ht|f)tp(s?))\://){1}[^\*\s"\'\[\]]+)~';
  224. function replace_urls($matches) {
  225. $link = html_entity_decode(str_replace('\\', '/', $matches[0]));
  226. # handle links with parenthese properly
  227. $rest = '';
  228. if (strpos($link, ')')!==false && strpos($link, '(')===false) {
  229. $rest = substr($link, strpos($link, ')'));
  230. $link = substr($link, 0, strpos($link, ')'));
  231. } else if (preg_match('~([\.",])$~', $link, $m)) {
  232. # dots at the end of the string are really not part of the url(in most cases)
  233. $link = substr($link, 0, -1);
  234. $rest = $m[1];
  235. }
  236. $link_location = $link;
  237. // $link_repr = preg_replace('~([%/\?=:])~', '$1<wbr></wbr>', $link);
  238. $link_repr = $link;
  239. $link_repr = wordwrap($link_repr, 120, '<wbr></wbr>', 1);
  240. return '<a href="' . $link_location . '" target="_blank">' . $link_repr . '</a>' . $rest;
  241. }
  242. function replace_embed_links($matches) {
  243. $link_text = $matches[1];
  244. $link_location = $matches[2];
  245. if (preg_match($this->link_regex, $link_text)) {
  246. return $link_text;
  247. }
  248. return '<a href="' . $link_location . '" target="_blank">' . $link_text . '</a>';
  249. }
  250. function process($html) {
  251. /*
  252. $html = preg_replace_callback(
  253. '~\[([^\]]*?) (.*?)\]~',
  254. array($this, 'replace_embed_links'),
  255. $html
  256. );
  257. */
  258. $html = preg_replace_callback(
  259. $this->link_regex,
  260. array($this, 'replace_urls'),
  261. $html
  262. );
  263. return $html;
  264. }
  265. }
  266. class Htmlizer_Filter_Bold extends Htmlizer_Filter_Inline {
  267. protected $start_token = '*', $end_token = '*',
  268. $replaced_start = '<strong>',
  269. $replaced_end = '</strong>';
  270. }
  271. class Htmlizer_Filter_Header extends Htmlizer_Filter_Inline {
  272. protected $start_token = '=', $end_token = '=',
  273. $replaced_start = '<h2>',
  274. $replaced_end = '</h2>';
  275. protected $whole_line_only = true;
  276. }
  277. class Htmlizer_Filter_SuperScript extends Htmlizer_Filter_Inline{
  278. protected $start_token = '^', $end_token = '^',
  279. $replaced_start = '<sup>',
  280. $replaced_end = '</sup>';
  281. }
  282. class Htmlizer_Filter_SubScript extends Htmlizer_Filter_Inline{
  283. protected $start_token = '~', $end_token = '~',
  284. $replaced_start = '<sub>',
  285. $replaced_end = '</sub>';
  286. }
  287. ?>