PageRenderTime 74ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/library/html2pdf/_class/parsingHtml.class.php

https://bitbucket.org/openemr/openemr
PHP | 545 lines | 372 code | 49 blank | 124 comment | 60 complexity | a80c59be4af3e77ddc4b387c5de591fa MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0, GPL-2.0, LGPL-3.0, BSD-3-Clause, Unlicense, MPL-2.0, GPL-3.0, LGPL-2.1
  1. <?php
  /**
   * HTML2PDF Library - parsingHtml class
   *
   * HTML => PDF convertor
   * distributed under the LGPL License
   *
   * Splits an HTML string into tags and text runs, validates the tag nesting,
   * and exposes the result in $code as a flat list of "actions": one array per
   * opening tag, closing tag or text run.
   *
   * @package   Html2pdf
   * @author    Laurent MINGUET <webmaster@html2pdf.fr>
   * @copyright 2016 Laurent MINGUET
   */
  class HTML2PDF_parsingHtml
  {
      protected $_html     = '';       // HTML code to parse
      protected $_num      = 0;        // counter of the opened ul/ol/table tags
      protected $_level    = array(0); // stack of the ul/ol/table numbers currently opened (root = 0)
      protected $_encoding = '';       // encoding given to html_entity_decode()
      public    $code      = array();  // parsed HTML code (list of actions)

      // replacement of a tabulation inside <pre>/<code> content
      // NOTE(review): whitespace inside this literal may have been collapsed
      // when the file was extracted - confirm against the upstream library.
      const HTML_TAB = ' ';

      /**
       * main constructor
       *
       * @param string $encoding
       * @access public
       */
      public function __construct($encoding = 'UTF-8')
      {
          $this->_num   = 0;
          $this->_level = array($this->_num);
          $this->_html  = '';
          $this->code   = array();
          $this->setEncoding($encoding);
      }

      /**
       * change the encoding
       *
       * @param string $encoding
       * @access public
       */
      public function setEncoding($encoding)
      {
          $this->_encoding = $encoding;
      }

      /**
       * Define the HTML code to parse
       *
       * HTML comments are removed before the code is stored.
       *
       * @param string $html code
       * @access public
       */
      public function setHTML($html)
      {
          // remove the HTML in comment
          $stripped = preg_replace('/<!--(.*)-->/isU', '', $html);

          // preg_replace() returns null when PCRE fails (backtrack limit, ...):
          // keep the raw HTML in that case rather than losing the whole document
          $this->_html = ($stripped === null) ? $html : $stripped;
      }

      /**
       * parse the HTML code
       *
       * Fills $this->code with the list of actions. Each tag action has the
       * keys 'name', 'close', 'autoclose', 'param' and 'html_pos'; each text
       * action is named 'write' and carries its text in ['param']['txt'].
       *
       * @throws HTML2PDF_exception code 3: closing tag while no tag is opened
       * @throws HTML2PDF_exception code 4: closing tag not matching the last opened tag
       * @throws HTML2PDF_exception code 5: tags still opened at the end of the HTML
       * @access public
       */
      public function parse()
      {
          // stack of the names of the currently opened tags
          $parents = array();

          // flag : are we in a <pre> Tag ?
          $tagPreIn = false;

          // action to use for each line of the content of a <pre> Tag
          $tagPreBr = array(
              'name'  => 'br',
              'close' => false,
              'param' => array(
                  'style' => array(),
                  'num'   => 0
              )
          );

          // tag that can be not closed
          $tagsNotClosed = array(
              'br', 'hr', 'img', 'col',
              'input', 'link', 'option',
              'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
          );

          // search the HTML tags
          $parts = $this->_searchCode();

          // all the actions to do
          $actions = array();

          // foreach part of the HTML code
          foreach ($parts as $part) {
              // if it is a tag code
              if ($part[0] == 'code') {
                  // analyze the HTML code
                  $res = $this->_analyzeCode($part[1]);

                  if ($res) {
                      // it is a real HTML tag: save the current position in the HTML code
                      $res['html_pos'] = $part[2];

                      // if the tag must be closed
                      if (!in_array($res['name'], $tagsNotClosed, true)) {
                          if ($res['close']) {
                              // closure tag: HTML validation
                              if (count($parents) < 1) {
                                  throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
                              } else if (end($parents) !== $res['name']) {
                                  throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
                              } else {
                                  array_pop($parents);
                              }
                          } else if ($res['autoclose']) {
                              // auto-closed tag: save the opened tag ...
                              $actions[] = $res;

                              // ... and prepare the closed tag, which is saved below.
                              // NOTE(review): the key written here is 'params' while
                              // every reader uses 'param', so the closing action keeps
                              // the parameters of the opening one. Kept as is because
                              // emptying 'param' would change the produced actions.
                              $res['params'] = array();
                              $res['close']  = true;
                          } else {
                              // opening tag: add a child for validation
                              array_push($parents, $res['name']);
                          }

                          // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
                          if (($res['name'] == 'pre' || $res['name'] == 'code') && !$res['autoclose']) {
                              $tagPreIn = !$res['close'];
                          }
                      }

                      // save the actions to convert
                      $actions[] = $res;
                  } else {
                      // it is not a real HTML tag => we transform it in Text
                      $part[0] = 'txt';
                  }
              }

              // if it is text
              if ($part[0] == 'txt') {
                  if (!$tagPreIn) {
                      // we are not in a <pre> tag: save the action
                      $actions[] = array(
                          'name'  => 'write',
                          'close' => false,
                          'param' => array('txt' => $this->_prepareTxt($part[1])),
                      );
                  } else {
                      // we are in a <pre> tag: one 'write' per line, separated by a <br>
                      $lines = explode("\n", str_replace("\r", '', $part[1]));

                      foreach ($lines as $k => $txt) {
                          // transform the line: spaces must be kept
                          $txt = str_replace("\t", self::HTML_TAB, $txt);
                          $txt = str_replace(' ', '&nbsp;', $txt);

                          // add a break line
                          if ($k > 0) {
                              $actions[] = $tagPreBr;
                          }

                          // save the action
                          $actions[] = array(
                              'name'  => 'write',
                              'close' => false,
                              'param' => array('txt' => $this->_prepareTxt($txt, false)),
                          );
                      }
                  }
              }
          }

          // for each identified action, we have to clean up the begin and the end of the text
          // based on tags that surround it

          // list of the tags to clean
          $tagsToClean = array(
              'page', 'page_header', 'page_footer', 'form',
              'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
              'div', 'hr', 'p', 'ul', 'ol', 'li',
              'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
              'bookmark', 'fieldset', 'legend',
              'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
              'option'
          );

          // foreach action
          $nb = count($actions);
          for ($k = 0; $k < $nb; $k++) {
              // only the texts have to be cleaned
              if ($actions[$k]['name'] != 'write') {
                  continue;
              }

              // isset() is required on the neighbours: the previous action may
              // have been removed by this very loop (text emptied by the trim),
              // and the first/last actions have no neighbour on one side

              // if the tag before the text is a tag to clean => ltrim on the text
              if (isset($actions[$k - 1]) && in_array($actions[$k - 1]['name'], $tagsToClean, true)) {
                  $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
              }

              // if the tag after the text is a tag to clean => rtrim on the text
              if (isset($actions[$k + 1]) && in_array($actions[$k + 1]['name'], $tagsToClean, true)) {
                  $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
              }

              // if the text is empty => remove the action
              if (!strlen($actions[$k]['param']['txt'])) {
                  unset($actions[$k]);
              }
          }

          // if we are not on the level 0 => HTML validator ERROR
          if (count($parents)) {
              throw new HTML2PDF_exception(5, $parents);
          }

          // save the actions to do (re-indexed, because some texts were removed)
          $this->code = array_values($actions);
      }

      /**
       * prepare the text
       *
       * Optionally collapses the whitespace, then decodes the HTML entities
       * with the current encoding.
       *
       * @param string  $txt
       * @param boolean $spaces true => replace multiple space+\t+\r+\n by a single space
       * @return string txt
       * @access protected
       */
      protected function _prepareTxt($txt, $spaces = true)
      {
          if ($spaces) {
              $collapsed = preg_replace('/\s+/isu', ' ', $txt);

              // the "u" modifier makes preg_replace() return null when the text
              // is not valid UTF-8 (other encodings): retry byte-wise instead of
              // losing the text
              if ($collapsed === null) {
                  $collapsed = preg_replace('/\s+/', ' ', $txt);
              }
              if ($collapsed !== null) {
                  $txt = $collapsed;
              }
          }

          // replaced by hand before the generic entity decoding
          $txt = str_replace('&euro;', '€', $txt);
          $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);

          return $txt;
      }

      /**
       * split the HTML code into tags and texts
       *
       * @return array list of array('txt', text) and array('code', tag, position of the tag in the HTML)
       */
      protected function _searchCode()
      {
          // initialise the array
          $parts = array();

          // regexp to separate the tags from the texts.
          // Greedy on purpose: a text run is read with a single match, instead
          // of one match per character with an ungreedy pattern.
          $reg = '/(<[^>]+>)|([^<]+)/is';

          // text read since the last tag
          $str    = '';
          $offset = 0;

          // As it finds a match
          while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
              if ($parse[1][0] !== '') {
                  // it is a tag: save the previous text if it exists
                  if ($str !== '') {
                      $parts[] = array('txt', $str);
                  }

                  // save the tag, with its position in the HTML
                  $parts[] = array('code', trim($parse[1][0]), $parse[1][1]);

                  // init the current text
                  $str = '';
              } else {
                  // it is a text: add it to the current text. Texts separated by
                  // a "<" that opens no tag are skipped over by the regexp and
                  // end up concatenated here.
                  $str .= $parse[2][0];
              }

              // Update offset to the end of the match
              $offset = $parse[0][1] + strlen($parse[0][0]);
              unset($parse);
          }

          // if a text is present in the end, we save it
          if ($str !== '') {
              $parts[] = array('txt', $str);
          }

          return $parts;
      }

      /**
       * analyse a HTML tag
       *
       * Reads the name and the attributes of the tag, converts the legacy
       * presentation attributes (width, align, bgcolor, border, ...) into
       * style declarations, and splits the style into an array property => value.
       *
       * @param string $code HTML code to analyse
       * @return array|null corresponding action, null if $code is not a tag
       */
      protected function _analyzeCode($code)
      {
          // name of the tag, opening, closure, autoclosure
          $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
          if (!preg_match('/'.$tag.'/isU', $code, $match)) {
              return null;
          }
          $close     = ($match[1] == '/' ? true : false);
          $autoclose = preg_match('/\/>$/isU', $code);
          $name      = strtolower($match[2]);

          // required parameters (depends on the tag name)
          $param = array();
          $param['style'] = '';
          if ($name == 'img') {
              $param['alt'] = '';
              $param['src'] = '';
          }
          if ($name == 'a') {
              $param['href'] = '';
          }

          // read the parameters, in this order (a later form overwrites an earlier one):
          // name=value, then name="value", then name='value'
          $props = array(
              '([a-zA-Z0-9_]+)=([^"\'\s>]+)',
              '([a-zA-Z0-9_]+)=["]([^"]*)["]',
              "([a-zA-Z0-9_]+)=[']([^']*)[']",
          );
          foreach ($props as $prop) {
              preg_match_all('/'.$prop.'/is', $code, $match);
              for ($k = 0; $k < count($match[0]); $k++) {
                  $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
              }
          }

          // the declarations built below are appended to the style: make sure
          // that the style written in the HTML ends with ";", otherwise its
          // last declaration would swallow the first appended one
          $param['style'] = trim($param['style']);
          if ($param['style'] !== '' && substr($param['style'], -1) !== ';') {
              $param['style'] .= ';';
          }

          // compliance of each parameter
          $color  = "#000000";
          $border = null;
          foreach ($param as $key => $val) {
              $key = strtolower($key);
              switch($key)
              {
                  case 'width':
                      unset($param[$key]);
                      $param['style'] .= 'width: '.$val.'px; ';
                      break;

                  case 'align':
                      // kept as an attribute on the tables
                      if ($name === 'img') {
                          unset($param[$key]);
                          $param['style'] .= 'float: '.$val.'; ';
                      } elseif ($name !== 'table') {
                          unset($param[$key]);
                          $param['style'] .= 'text-align: '.$val.'; ';
                      }
                      break;

                  case 'valign':
                      unset($param[$key]);
                      $param['style'] .= 'vertical-align: '.$val.'; ';
                      break;

                  case 'height':
                      unset($param[$key]);
                      $param['style'] .= 'height: '.$val.'px; ';
                      break;

                  case 'bgcolor':
                      unset($param[$key]);
                      $param['style'] .= 'background: '.$val.'; ';
                      break;

                  case 'bordercolor':
                      unset($param[$key]);
                      $color = $val;
                      break;

                  case 'border':
                      unset($param[$key]);
                      if (preg_match('/^[0-9]+$/isU', $val)) {
                          $val = $val.'px';
                      }
                      $border = $val;
                      break;

                  case 'cellpadding':
                  case 'cellspacing':
                      if (preg_match('/^([0-9]+)$/isU', $val)) {
                          $param[$key] = $val.'px';
                      }
                      break;

                  case 'colspan':
                  case 'rowspan':
                      // keep the digits only, 1 by default
                      $val = preg_replace('/[^0-9]/isU', '', $val);
                      if (!$val) {
                          $val = 1;
                      }
                      $param[$key] = $val;
                      break;
              }
          }

          // compliance of the border (built after the loop, because it needs
          // both the border and the bordercolor attributes)
          if ($border !== null) {
              if ($border) {
                  $border = 'border: solid '.$border.' '.$color;
              } else {
                  $border = 'border: none';
              }

              $param['style'] .= $border.'; ';
              $param['border'] = $border;
          }

          // reading styles: decomposition and standardization
          $styles = explode(';', $param['style']);
          $param['style'] = array();
          foreach ($styles as $style) {
              $tmp = explode(':', $style);
              if (count($tmp) > 1) {
                  // the value may itself contain ":" => only the first one is a separator
                  $cod = $tmp[0];
                  unset($tmp[0]);
                  $tmp = implode(':', $tmp);
                  $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
              }
          }

          $isLevelTag = in_array($name, array('ul', 'ol', 'table'), true);

          // determining the level of table opening, with an added level
          if ($isLevelTag && !$close) {
              $this->_num++;
              $this->_level[count($this->_level)] = $this->_num;
          }

          // get the level of the table containing the element
          if (!isset($param['num'])) {
              $param['num'] = $this->_level[count($this->_level) - 1];
          }

          // for closures table: remove a level, but never the root one (a
          // closing tag without opening would leave the stack empty)
          if ($isLevelTag && $close && count($this->_level) > 1) {
              unset($this->_level[count($this->_level) - 1]);
          }

          // prepare the parameters
          if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
          if (isset($param['alt']))   $param['alt']   = $this->_prepareTxt($param['alt']);
          if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
          if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);

          // return the new action to do
          return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
      }

      /**
       * get a full level of HTML, between an opening and closing corresponding
       *
       * Starting from the action $k, returns the actions found up to the
       * closing of the tag opened at $k (nested tags of the same name are
       * counted). The opening and closing actions themselves are not returned.
       *
       * @param integer $k index, in $this->code, of the action to start from
       * @return array actions
       */
      public function getLevel($k)
      {
          // if the code does not exist => return empty
          if (!isset($this->code[$k])) {
              return array();
          }

          // the tag to detect
          $detect = $this->code[$k]['name'];

          // if it is a text => return
          if ($detect == 'write') {
              return array($this->code[$k]);
          }

          $level = 0;       // depth level
          $end   = false;   // end of the search
          $code  = array(); // extract code

          // while it's not ended
          while (!$end) {
              // current action
              $row = $this->code[$k];

              if ($row['name'] == 'write') {
                  // if 'write' => we add the text
                  $code[] = $row;
              } else {
                  // else, it is a html tag
                  $not = false; // flag for not taking into account the current tag

                  // if it is the searched tag
                  if ($row['name'] == $detect) {
                      // if we are just at the root level => dont take it
                      if ($level == 0) {
                          $not = true;
                      }

                      // update the level
                      $level += ($row['close'] ? -1 : 1);

                      // if we are now at the root level => it is the end, and dont take it
                      if ($level == 0) {
                          $not = true;
                          $end = true;
                      }
                  }

                  // if we can take into account the current tag => save it
                  if (!$not) {
                      // NOTE(review): the actions built by parse() store their
                      // styles in ['param']['style'], not in ['style'], so this
                      // test looks like it never matches for them - confirm the
                      // intent before changing it.
                      if (isset($row['style']['text-align'])) {
                          unset($row['style']['text-align']);
                      }
                      $code[] = $row;
                  }
              }

              // it continues as long as there has code to analyze
              if (isset($this->code[$k + 1])) {
                  $k++;
              } else {
                  $end = true;
              }
          }

          // return the extract
          return $code;
      }

      /**
       * return a part of the HTML code, for error message
       *
       * @param integer $pos    position in the HTML code
       * @param integer $before take before
       * @param integer $after  take after
       * @return string part of the html code
       */
      public function getHtmlErrorCode($pos, $before=30, $after=40)
      {
          // a negative start would make substr() count from the END of the
          // HTML: clamp it, and shorten the length accordingly
          $start  = max(0, $pos - $before);
          $length = max(0, $pos - $start + $after);

          return (string) substr($this->_html, $start, $length);
      }
  }