PageRenderTime 54ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/include/html2pdf_v4.03/_class/parsingHtml.class.php

https://bitbucket.org/sleininger/stock_online
PHP | 520 lines | 347 code | 49 blank | 124 comment | 45 complexity | a3159c8c5a8677f8f7aeb4b396439c3b MD5 | raw file
Possible License(s): LGPL-3.0, LGPL-2.1, GPL-3.0
  1. <?php
  2. /**
  3. * HTML2PDF Library - parsingHtml class
  4. *
  5. * HTML => PDF convertor
  6. * distributed under the LGPL License
  7. *
  8. * @author Laurent MINGUET <webmaster@html2pdf.fr>
  9. * @version 4.03
  10. */
  11. class HTML2PDF_parsingHtml
  12. {
  13. protected $_html = ''; // HTML code to parse
  14. protected $_num = 0; // table number
  15. protected $_level = 0; // table level
  16. protected $_encoding = ''; // encoding
  17. public $code = array(); // parsed HTML codfe
  18. const HTML_TAB = ' ';
  19. /**
  20. * main constructor
  21. *
  22. * @param string encoding
  23. * @access public
  24. */
  25. public function __construct($encoding = 'UTF-8')
  26. {
  27. $this->_num = 0;
  28. $this->_level = array($this->_num);
  29. $this->_html = '';
  30. $this->code = array();
  31. $this->setEncoding($encoding);
  32. }
  33. /**
  34. * change the encoding
  35. *
  36. * @param string encoding
  37. * @access public
  38. */
  39. public function setEncoding($encoding)
  40. {
  41. $this->_encoding = $encoding;
  42. }
  43. /**
  44. * Define the HTML code to parse
  45. *
  46. * @param string HTML code
  47. * @access public
  48. */
  49. public function setHTML($html)
  50. {
  51. // remove the HTML in comment
  52. $html = preg_replace('/<!--(.*)-->/isU', '', $html);
  53. // save the HTML code
  54. $this->_html = $html;
  55. }
  56. /**
  57. * parse the HTML code
  58. *
  59. * @access public
  60. */
  61. public function parse()
  62. {
  63. $parents = array();
  64. // flag : are we in a <pre> Tag ?
  65. $tagPreIn = false;
  66. // action to use for each line of the content of a <pre> Tag
  67. $tagPreBr = array(
  68. 'name' => 'br',
  69. 'close' => false,
  70. 'param' => array(
  71. 'style' => array(),
  72. 'num' => 0
  73. )
  74. );
  75. // tag that can be not closed
  76. $tagsNotClosed = array(
  77. 'br', 'hr', 'img', 'col',
  78. 'input', 'link', 'option',
  79. 'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
  80. );
  81. // search the HTML tags
  82. $tmp = array();
  83. $this->_searchCode($tmp);
  84. // all the actions to do
  85. $actions = array();
  86. // foreach part of the HTML code
  87. foreach ($tmp as $part) {
  88. // if it is a tag code
  89. if ($part[0]=='code') {
  90. // analise the HTML code
  91. $res = $this->_analiseCode($part[1]);
  92. // if it is a real HTML tag
  93. if ($res) {
  94. // save the current posistion in the HTML code
  95. $res['html_pos'] = $part[2];
  96. // if the tag must be closed
  97. if (!in_array($res['name'], $tagsNotClosed)) {
  98. // if it is a closure tag
  99. if ($res['close']) {
  100. // HTML validation
  101. if (count($parents)<1)
  102. throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
  103. else if ($parents[count($parents)-1]!=$res['name'])
  104. throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
  105. else
  106. unset($parents[count($parents)-1]);
  107. } else {
  108. // if it is a autoclosed tag
  109. if ($res['autoclose']) {
  110. // save the opened tag
  111. $actions[] = $res;
  112. // prepare the closed tag
  113. $res['params'] = array();
  114. $res['close'] = true;
  115. }
  116. // else :add a child for validation
  117. else
  118. $parents[count($parents)] = $res['name'];
  119. }
  120. // if it is a <pre> tag (or <code> tag) not auclosed => update the flag
  121. if (($res['name']=='pre' || $res['name']=='code') && !$res['autoclose']) {
  122. $tagPreIn = !$res['close'];
  123. }
  124. }
  125. // save the actions to convert
  126. $actions[] = $res;
  127. } else { // else (it is not a real HTML tag => we transform it in Texte
  128. $part[0]='txt';
  129. }
  130. }
  131. // if it is text
  132. if ($part[0]=='txt') {
  133. // if we are not in a <pre> tag
  134. if (!$tagPreIn) {
  135. // save the action
  136. $actions[] = array(
  137. 'name' => 'write',
  138. 'close' => false,
  139. 'param' => array('txt' => $this->_prepareTxt($part[1])),
  140. );
  141. } else { // else (if we are in a <pre> tag)
  142. // prepare the text
  143. $part[1] = str_replace("\r", '', $part[1]);
  144. $part[1] = explode("\n", $part[1]);
  145. // foreach line of the text
  146. foreach ($part[1] as $k => $txt) {
  147. // transform the line
  148. $txt = str_replace("\t", self::HTML_TAB, $txt);
  149. $txt = str_replace(' ', '&nbsp;', $txt);
  150. // add a break line
  151. if ($k>0) $actions[] = $tagPreBr;
  152. // save the action
  153. $actions[] = array(
  154. 'name' => 'write',
  155. 'close' => false,
  156. 'param' => array('txt' => $this->_prepareTxt($txt, false)),
  157. );
  158. }
  159. }
  160. }
  161. }
  162. // for each indentified action, we have to clean up the begin and the end of the texte
  163. // based on tags that surround it
  164. // list of the tags to clean
  165. $tagsToClean = array(
  166. 'page', 'page_header', 'page_footer', 'form',
  167. 'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
  168. 'div', 'hr', 'p', 'ul', 'ol', 'li',
  169. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  170. 'bookmark', 'fieldset', 'legend',
  171. 'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
  172. 'option'
  173. );
  174. // foreach action
  175. $nb = count($actions);
  176. for ($k=0; $k<$nb; $k++) {
  177. // if it is a Text
  178. if ($actions[$k]['name']=='write') {
  179. // if the tag before the text is a tag to clean => ltrim on the text
  180. if ($k>0 && in_array($actions[$k-1]['name'], $tagsToClean))
  181. $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
  182. // if the tag after the text is a tag to clean => rtrim on the text
  183. if ($k<$nb-1 && in_array($actions[$k+1]['name'], $tagsToClean))
  184. $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
  185. // if the text is empty => remove the action
  186. if (!strlen($actions[$k]['param']['txt']))
  187. unset($actions[$k]);
  188. }
  189. }
  190. // if we are not on the level 0 => HTML validator ERROR
  191. if (count($parents)) throw new HTML2PDF_exception(5, $parents);
  192. // save the actions to do
  193. $this->code = array_values($actions);
  194. }
  195. /**
  196. * prepare the text
  197. *
  198. * @param string texte
  199. * @param boolean true => replace multiple space+\t+\r+\n by a single space
  200. * @return string texte
  201. * @access protected
  202. */
  203. protected function _prepareTxt($txt, $spaces = true)
  204. {
  205. if ($spaces) $txt = preg_replace('/\s+/is', ' ', $txt);
  206. $txt = str_replace('&euro;', '€', $txt);
  207. $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
  208. return $txt;
  209. }
  210. /**
  211. * parse the HTML code
  212. *
  213. * @param &array array's result
  214. * @return null
  215. */
  216. protected function _searchCode(&$tmp)
  217. {
  218. // initialise the array
  219. $tmp = array();
  220. // regexp to separate the tags from the texts
  221. $reg = '/(<[^>]+>)|([^<]+)+/isU';
  222. // last match found
  223. $str = '';
  224. $offset = 0;
  225. // As it finds a match
  226. while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
  227. // if it is a tag
  228. if ($parse[1][0]) {
  229. // save the previous text if it exists
  230. if ($str!=='') $tmp[] = array('txt', $str);
  231. // save the tag, with the offset
  232. $tmp[] = array('code', trim($parse[1][0]), $offset);
  233. // init the current text
  234. $str = '';
  235. } else { // else (if it is a text)
  236. // add the new text to the current text
  237. $str.= $parse[2][0];
  238. }
  239. // Update offset to the end of the match
  240. $offset = $parse[0][1] + strlen($parse[0][0]);
  241. unset($parse);
  242. }
  243. // if a text is present in the end, we save it
  244. if ($str!='') $tmp[] = array('txt', $str);
  245. unset($str);
  246. }
  247. /**
  248. * analise a HTML tag
  249. *
  250. * @param string HTML code to analise
  251. * @return array corresponding action
  252. */
  253. protected function _analiseCode($code)
  254. {
  255. // name of the tag, opening, closure, autoclosure
  256. $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
  257. if (!preg_match('/'.$tag.'/isU', $code, $match)) return null;
  258. $close = ($match[1]=='/' ? true : false);
  259. $autoclose = preg_match('/\/>$/isU', $code);
  260. $name = strtolower($match[2]);
  261. // required parameters (depends on the tag name)
  262. $param = array();
  263. $param['style'] = '';
  264. if ($name=='img') {
  265. $param['alt'] = '';
  266. $param['src'] = '';
  267. }
  268. if ($name=='a') {
  269. $param['href'] = '';
  270. }
  271. // read the parameters : nom=valeur
  272. $prop = '([a-zA-Z0-9_]+)=([^"\'\s>]+)';
  273. preg_match_all('/'.$prop.'/is', $code, $match);
  274. for($k=0; $k<count($match[0]); $k++)
  275. $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
  276. // read the parameters : nom="valeur"
  277. $prop = '([a-zA-Z0-9_]+)=["]([^"]*)["]';
  278. preg_match_all('/'.$prop.'/is', $code, $match);
  279. for($k=0; $k<count($match[0]); $k++)
  280. $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
  281. // read the parameters : nom='valeur'
  282. $prop = "([a-zA-Z0-9_]+)=[']([^']*)[']";
  283. preg_match_all('/'.$prop.'/is', $code, $match);
  284. for($k=0; $k<count($match[0]); $k++)
  285. $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
  286. // compliance of each parameter
  287. $color = "#000000";
  288. $border = null;
  289. foreach ($param as $key => $val) {
  290. $key = strtolower($key);
  291. switch($key)
  292. {
  293. case 'width':
  294. unset($param[$key]);
  295. $param['style'] .= 'width: '.$val.'px; ';
  296. break;
  297. case 'align':
  298. if ($name==='img') {
  299. unset($param[$key]);
  300. $param['style'] .= 'float: '.$val.'; ';
  301. } elseif ($name!=='table') {
  302. unset($param[$key]);
  303. $param['style'] .= 'text-align: '.$val.'; ';
  304. }
  305. break;
  306. case 'valign':
  307. unset($param[$key]);
  308. $param['style'] .= 'vertical-align: '.$val.'; ';
  309. break;
  310. case 'height':
  311. unset($param[$key]);
  312. $param['style'] .= 'height: '.$val.'px; ';
  313. break;
  314. case 'bgcolor':
  315. unset($param[$key]);
  316. $param['style'] .= 'background: '.$val.'; ';
  317. break;
  318. case 'bordercolor':
  319. unset($param[$key]);
  320. $color = $val;
  321. break;
  322. case 'border':
  323. unset($param[$key]);
  324. if (preg_match('/^[0-9]+$/isU', $val)) $val = $val.'px';
  325. $border = $val;
  326. break;
  327. case 'cellpadding':
  328. case 'cellspacing':
  329. if (preg_match('/^([0-9]+)$/isU', $val)) $param[$key] = $val.'px';
  330. break;
  331. case 'colspan':
  332. case 'rowspan':
  333. $val = preg_replace('/[^0-9]/isU', '', $val);
  334. if (!$val) $val = 1;
  335. $param[$key] = $val;
  336. break;
  337. }
  338. }
  339. // compliance of the border
  340. if ($border!==null) {
  341. if ($border) $border = 'border: solid '.$border.' '.$color;
  342. else $border = 'border: none';
  343. $param['style'] .= $border.'; ';
  344. $param['border'] = $border;
  345. }
  346. // reading styles: decomposition and standardization
  347. $styles = explode(';', $param['style']);
  348. $param['style'] = array();
  349. foreach ($styles as $style) {
  350. $tmp = explode(':', $style);
  351. if (count($tmp)>1) {
  352. $cod = $tmp[0];
  353. unset($tmp[0]);
  354. $tmp = implode(':', $tmp);
  355. $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
  356. }
  357. }
  358. // determining the level of table opening, with an added level
  359. if (in_array($name, array('ul', 'ol', 'table')) && !$close) {
  360. $this->_num++;
  361. $this->_level[count($this->_level)] = $this->_num;
  362. }
  363. // get the level of the table containing the element
  364. if (!isset($param['num'])) {
  365. $param['num'] = $this->_level[count($this->_level)-1];
  366. }
  367. // for closures table: remove a level
  368. if (in_array($name, array('ul', 'ol', 'table')) && $close) {
  369. unset($this->_level[count($this->_level)-1]);
  370. }
  371. // prepare the parameters
  372. if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
  373. if (isset($param['alt'])) $param['alt'] = $this->_prepareTxt($param['alt']);
  374. if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
  375. if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);
  376. // return the new action to do
  377. return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
  378. }
  379. /**
  380. * get a full level of HTML, between an opening and closing corresponding
  381. *
  382. * @param integer key
  383. * @return array actions
  384. */
  385. public function getLevel($k)
  386. {
  387. // if the code does not exist => return empty
  388. if (!isset($this->code[$k])) return array();
  389. // the tag to detect
  390. $detect = $this->code[$k]['name'];
  391. // if it is a text => return
  392. if ($detect=='write') {
  393. return array($this->code[$k]);
  394. }
  395. //
  396. $level = 0; // depth level
  397. $end = false; // end of the search
  398. $code = array(); // extract code
  399. // while it's not ended
  400. while (!$end) {
  401. // current action
  402. $row = $this->code[$k];
  403. // if 'write' => we add the text
  404. if ($row['name']=='write') {
  405. $code[] = $row;
  406. } else { // else, it is a html tag
  407. $not = false; // flag for not taking into account the current tag
  408. // if it is the searched tag
  409. if ($row['name']==$detect) {
  410. // if we are just at the root level => dont take it
  411. if ($level==0) {
  412. $not = true;
  413. }
  414. // update the level
  415. $level+= ($row['close'] ? -1 : 1);
  416. // if we are now at the root level => it is the end, and dont take it
  417. if ($level==0) {
  418. $not = true;
  419. $end = true;
  420. }
  421. }
  422. // if we can takin into account the current tag => save it
  423. if (!$not) {
  424. if (isset($row['style']['text-align'])) unset($row['style']['text-align']);
  425. $code[] = $row;
  426. }
  427. }
  428. // it continues as long as there has code to analise
  429. if (isset($this->code[$k+1]))
  430. $k++;
  431. else
  432. $end = true;
  433. }
  434. // return the extract
  435. return $code;
  436. }
  437. /**
  438. * return a part of the HTML code, for error message
  439. *
  440. * @param integer position
  441. * @param integer take before
  442. * @param integer take after
  443. * @return string part of the html code
  444. */
  445. public function getHtmlErrorCode($pos, $before=30, $after=40)
  446. {
  447. return substr($this->_html, $pos-$before, $before+$after);
  448. }
  449. }