PageRenderTime 54ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/application/third_party/PHPExcel/Reader/HTML.php

https://bitbucket.org/masangga/laperbanget
PHP | 499 lines | 320 code | 41 blank | 138 comment | 34 complexity | eb8dcfc09e11953f8667fd401ab6b8b5 MD5 | raw file
  1. <?php
  2. /**
  3. * PHPExcel
  4. *
  5. * Copyright (c) 2006 - 2012 PHPExcel
  6. *
  7. * This library is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * This library is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with this library; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. *
  21. * @category PHPExcel
  22. * @package PHPExcel_Reader
  23. * @copyright Copyright (c) 2006 - 2012 PHPExcel (http://www.codeplex.com/PHPExcel)
  24. * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL
  25. * @version 1.7.8, 2012-10-12
  26. */
  27. /** PHPExcel root directory */
  if (defined('PHPEXCEL_ROOT') === false) {
    // No earlier file has published the library root yet: define it relative to this
    // file (two directories up) and pull in the autoloader file from there.
    /**
     * @ignore
     */
    define('PHPEXCEL_ROOT', dirname(__FILE__) . '/../../');
    require PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php';
  }
  35. /**
  36. * PHPExcel_Reader_HTML
  37. *
  38. * @category PHPExcel
  39. * @package PHPExcel_Reader
  40. * @copyright Copyright (c) 2006 - 2012 PHPExcel (http://www.codeplex.com/PHPExcel)
  41. */
  42. class PHPExcel_Reader_HTML implements PHPExcel_Reader_IReader
  43. {
  44. /**
  45. * Input encoding
  46. *
  47. * @var string
  48. */
  49. private $_inputEncoding = 'ANSI';
  50. /**
  51. * Sheet index to read
  52. *
  53. * @var int
  54. */
  55. private $_sheetIndex = 0;
  56. /**
  57. * Formats
  58. *
  59. * @var array
  60. */
  // Style arrays keyed by HTML tag name; _processDomElement() passes the matching entry to
  // getStyle(...)->applyFromArray() for the cell the tag was written to.
  private $_formats = array(
    // Bold, 24pt
    'h1' => array(
      'font' => array(
        'bold' => true,
        'size' => 24,
      ),
    ),
    // Bold, 18pt
    'h2' => array(
      'font' => array(
        'bold' => true,
        'size' => 18,
      ),
    ),
    // Bold, 13.5pt
    'h3' => array(
      'font' => array(
        'bold' => true,
        'size' => 13.5,
      ),
    ),
    // Bold, 12pt
    'h4' => array(
      'font' => array(
        'bold' => true,
        'size' => 12,
      ),
    ),
    // Bold, 10pt
    'h5' => array(
      'font' => array(
        'bold' => true,
        'size' => 10,
      ),
    ),
    // Bold, 7.5pt
    'h6' => array(
      'font' => array(
        'bold' => true,
        'size' => 7.5,
      ),
    ),
    // Blue underlined
    'a' => array(
      'font' => array(
        'underline' => true,
        'color' => array(
          'argb' => PHPExcel_Style_Color::COLOR_BLUE,
        ),
      ),
    ),
    // Bottom border
    'hr' => array(
      'borders' => array(
        'bottom' => array(
          'style' => PHPExcel_Style_Border::BORDER_THIN,
          // The colour must be keyed ('argb'), exactly like the hyperlink colour above;
          // the original entry was an unkeyed list element.
          'color' => array(
            'argb' => PHPExcel_Style_Color::COLOR_BLACK,
          ),
        ),
      ),
    ),
  );
  97. /**
  98. * PHPExcel_Reader_IReadFilter instance
  99. *
  100. * @var PHPExcel_Reader_IReadFilter
  101. */
  102. private $_readFilter = null;
  103. /**
  104. * Create a new PHPExcel_Reader_HTML
  105. */
  public function __construct()
  {
    // Begin with the library's default read filter; setReadFilter() can swap it later
    $defaultFilter = new PHPExcel_Reader_DefaultReadFilter();
    $this->_readFilter = $defaultFilter;
  }
  109. /**
  110. * Can the current PHPExcel_Reader_IReader read the file?
  111. *
  112. * @param string $pFilename
  113. * @return boolean
  114. * @throws Exception
  115. */
  public function canRead($pFilename)
  {
    // Check if file exists
    if (!file_exists($pFilename)) {
      throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
    }

    // Read sample data (first 2 KB will do)
    $fh = fopen($pFilename, 'r');
    if ($fh === false) {
      // The file exists but cannot be opened (permissions, locking, ...)
      throw new Exception("Could not open " . $pFilename . " for reading!");
    }
    $data = fread($fh, 2048);
    fclose($fh);

    // An empty or unreadable sample cannot be HTML
    if ($data === false || $data === '') {
      return false;
    }

    // Treat the file as HTML only if the sample contains markup: it must hold a '<'
    // and stripping tags must actually remove something. Previously every existing
    // file was reported as readable and the sample was never inspected.
    if (strpos($data, '<') === false) {
      return false;
    }
    return strlen($data) !== strlen(strip_tags($data));
  }
  128. /**
  129. * Loads PHPExcel from file
  130. *
  131. * @param string $pFilename
  132. * @return PHPExcel
  133. * @throws Exception
  134. */
  public function load($pFilename)
  {
    // Populate a brand-new workbook and hand it back to the caller
    return $this->loadIntoExisting($pFilename, new PHPExcel());
  }
  142. /**
  143. * Read filter
  144. *
  145. * @return PHPExcel_Reader_IReadFilter
  146. */
  public function getReadFilter()
  {
    // Currently configured read filter (set in the constructor or via setReadFilter())
    $activeFilter = $this->_readFilter;
    return $activeFilter;
  }
  150. /**
  151. * Set read filter
  152. *
  153. * @param PHPExcel_Reader_IReadFilter $pValue
  154. */
  public function setReadFilter(PHPExcel_Reader_IReadFilter $pValue)
  {
    // Replace the active filter, then return the reader itself so calls can be chained
    $reader = $this;
    $reader->_readFilter = $pValue;
    return $reader;
  }
  159. /**
  160. * Set input encoding
  161. *
  162. * @param string $pValue Input encoding
  163. */
  public function setInputEncoding($pValue = 'ANSI')
  {
    // Remember the encoding name, then return the reader itself so calls can be chained
    $reader = $this;
    $reader->_inputEncoding = $pValue;
    return $reader;
  }
  169. /**
  170. * Get input encoding
  171. *
  172. * @return string
  173. */
  public function getInputEncoding()
  {
    // Encoding name as last stored by setInputEncoding() (initially 'ANSI')
    $encoding = $this->_inputEncoding;
    return $encoding;
  }
  178. // Data Array used for testing only, should write to PHPExcel object on completion of tests
  179. private $_dataArray = array();
  180. private $_tableLevel = 0;
  181. private $_nestedColumn = array('A');
  /**
   * Enter a new table nesting level and record the column it starts in.
   *
   * @param string $column Column the cursor is in when the table opens
   * @return string Start column recorded for the new level
   */
  private function _setTableStartColumn($column)
  {
    // A table opened outside any other table always starts in column A
    $startColumn = ($this->_tableLevel == 0) ? 'A' : $column;

    $level = ++$this->_tableLevel;
    $this->_nestedColumn[$level] = $startColumn;

    return $this->_nestedColumn[$level];
  }
  /**
   * Start column recorded for the current table nesting level.
   *
   * @return string
   */
  private function _getTableStartColumn()
  {
    $level = $this->_tableLevel;
    return $this->_nestedColumn[$level];
  }
  /**
   * Leave the current table nesting level.
   *
   * @return string Last entry removed from the start-column stack
   */
  private function _releaseTableStartColumn()
  {
    $this->_tableLevel -= 1;
    $releasedColumn = array_pop($this->_nestedColumn);
    return $releasedColumn;
  }
  /**
   * Write the pending cell content to the worksheet and reset the buffer.
   *
   * Plain strings are written only when they contain something other than whitespace.
   * Whatever the outcome, $cellContent is an empty string when this method returns.
   *
   * @param mixed  $sheet       Worksheet to write to (setCellValue() is called on it)
   * @param string $column      Column of the target cell
   * @param int    $row         Row of the target cell
   * @param mixed  $cellContent Pending content; reset to '' by reference
   */
  private function _flushCell($sheet, $column, $row, &$cellContent)
  {
    if (is_string($cellContent)) {
      // Simple string content: only write it if there is something in the string.
      // (The debug echo that used to announce every flush has been removed so that the
      // reader no longer writes to the output stream.)
      if (trim($cellContent) > '') {
        $sheet->setCellValue($column . $row, $cellContent, true);
        $this->_dataArray[$row][$column] = $cellContent;
      }
    } else {
      // We have a Rich Text run
      // TODO: write rich text to the worksheet; for now it is only recorded in the data array
      $this->_dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
    }
    $cellContent = (string) '';
  }
  /**
   * Walk the children of a DOM node and copy their text into the worksheet.
   *
   * Text nodes are appended to $cellContent. Element nodes are dispatched on their tag name,
   * which decides when the pending content is flushed to a cell and how the row/column cursor
   * moves. The method recurses into every element, so $row, $column and $cellContent are
   * shared (by reference) across the whole traversal.
   *
   * Changes from the previous implementation: the debug echo statements are gone (the reader
   * no longer writes to the output stream), and reaching <body> now really clears the pending
   * content instead of assigning an unused local variable.
   *
   * @param DOMNode $element     Node whose children are processed
   * @param mixed   $sheet       Worksheet being populated
   * @param int     $row         Row cursor, updated in place
   * @param string  $column      Column cursor, updated in place
   * @param mixed   $cellContent Content collected for the current cell, updated in place
   */
  private function _processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent)
  {
    foreach ($element->childNodes as $child) {
      if ($child instanceof DOMText) {
        // Trim the text and collapse internal whitespace runs to single spaces
        $domText = preg_replace('/\s+/', ' ', trim($child->nodeValue));
        if (is_string($cellContent)) {
          // Simply append the text if the cell content is a plain text string
          $cellContent .= $domText;
        }
        // TODO: if we have a rich text run instead, the text needs to be appended to it
        continue;
      }

      if (!($child instanceof DOMElement)) {
        // Other node types are skipped
        continue;
      }

      $attributeArray = array();
      foreach ($child->attributes as $attribute) {
        $attributeArray[$attribute->name] = $attribute->value;
      }

      switch ($child->nodeName) {
        case 'meta':
          // TODO: extract the character set from the 'content' attribute, so we can
          // convert to UTF-8 if required
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          break;

        case 'title':
          // The document title becomes the worksheet title
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          $sheet->setTitle($cellContent);
          $cellContent = '';
          break;

        case 'span':
        case 'div':
        case 'font':
        case 'i':
        case 'em':
        case 'strong':
        case 'b':
          // No styling is applied for these tags; their text is just kept apart from
          // its neighbours with a space on either side
          if ($cellContent > '') {
            $cellContent .= ' ';
          }
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          if ($cellContent > '') {
            $cellContent .= ' ';
          }
          break;

        case 'hr':
          $this->_flushCell($sheet, $column, $row, $cellContent);
          ++$row;
          if (isset($this->_formats[$child->nodeName])) {
            $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
          } else {
            $cellContent = '----------';
            $this->_flushCell($sheet, $column, $row, $cellContent);
          }
          ++$row;
          // No break here: execution continues into the 'br' handling below, exactly as
          // before. NOTE(review): presumably deliberate (a rule also ends the line) - confirm.
        case 'br':
          if ($this->_tableLevel > 0) {
            // If we're inside a table, replace with a \n
            $cellContent .= "\n";
          } else {
            // Otherwise flush our existing content and move the row cursor on
            $this->_flushCell($sheet, $column, $row, $cellContent);
            ++$row;
          }
          break;

        case 'a':
          if (isset($attributeArray['href'])) {
            // Attach the link target to the current cell and style it as a hyperlink
            $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
            if (isset($this->_formats[$child->nodeName])) {
              $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
            }
          }
          $cellContent .= ' ';
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          break;

        case 'h1':
        case 'h2':
        case 'h3':
        case 'h4':
        case 'h5':
        case 'h6':
        case 'ol':
        case 'ul':
        case 'p':
          if ($this->_tableLevel > 0) {
            // If we're inside a table, replace with a \n
            $cellContent .= "\n";
            $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          } else {
            // Write out anything collected before this block and leave a blank row
            if ($cellContent > '') {
              $this->_flushCell($sheet, $column, $row, $cellContent);
              $row += 2;
            }
            $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
            $this->_flushCell($sheet, $column, $row, $cellContent);
            if (isset($this->_formats[$child->nodeName])) {
              $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
            }
            // Leave a blank row after the block and return to the first column
            $row += 2;
            $column = 'A';
          }
          break;

        case 'li':
          if ($this->_tableLevel > 0) {
            // If we're inside a table, replace with a \n
            $cellContent .= "\n";
            $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          } else {
            if ($cellContent > '') {
              $this->_flushCell($sheet, $column, $row, $cellContent);
            }
            // Each list entry gets a row of its own
            ++$row;
            $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
            $this->_flushCell($sheet, $column, $row, $cellContent);
            $column = 'A';
          }
          break;

        case 'table':
          $this->_flushCell($sheet, $column, $row, $cellContent);
          $column = $this->_setTableStartColumn($column);
          if ($this->_tableLevel > 1) {
            // Nested table: step back one row, since its first <tr> advances the row again
            --$row;
          }
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          $column = $this->_releaseTableStartColumn();
          // The level tested here is the one that remains after the release above
          if ($this->_tableLevel > 1) {
            ++$column;
          } else {
            ++$row;
          }
          break;

        case 'thead':
        case 'tbody':
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          break;

        case 'tr':
          // New table row: next worksheet row, back to the table's start column
          ++$row;
          $column = $this->_getTableStartColumn();
          $cellContent = '';
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          break;

        case 'th':
        case 'td':
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          $this->_flushCell($sheet, $column, $row, $cellContent);
          ++$column;
          break;

        case 'body':
          // Start of the visible document: reset the cursor and the table state, and
          // drop any text collected so far so it cannot end up in the first cell
          $row = 1;
          $column = 'A';
          $cellContent = '';
          $this->_tableLevel = 0;
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
          break;

        default:
          // Unknown tags are transparent: just process their children
          $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
      }
    }
  }
  400. /**
  401. * Loads PHPExcel from file into PHPExcel instance
  402. *
  403. * @param string $pFilename
  404. * @param PHPExcel $objPHPExcel
  405. * @return PHPExcel
  406. * @throws Exception
  407. */
  public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
  {
    // Check if file exists
    if (!file_exists($pFilename)) {
      throw new Exception("Could not open " . $pFilename . " for reading! File does not exist.");
    }

    if (!is_file($pFilename)) {
      throw new Exception("Could not open " . $pFilename . " for reading! The given file is not a regular file.");
    }

    // Make sure the workbook has a sheet at the configured index, then activate it
    while ($objPHPExcel->getSheetCount() <= $this->_sheetIndex) {
      $objPHPExcel->createSheet();
    }
    $objPHPExcel->setActiveSheetIndex($this->_sheetIndex);

    // Create a new DOM object
    $dom = new DOMDocument;
    // Load the HTML file into the DOM object
    $loaded = $dom->loadHTMLFile($pFilename);
    if ($loaded === false) {
      // The message parts must be concatenated: passing them as separate arguments
      // handed the filename to the Exception constructor as its $code argument.
      throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
    }

    // Discard white space
    $dom->preserveWhiteSpace = false;

    // Walk the document, filling the active sheet. The cursor starts at row 0 / column A.
    $row = 0;
    $column = 'A';
    $content = '';
    $this->_processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content);

    // The debug '<hr />' echo and var_dump() of the collected data were removed here:
    // loading a file must not write to the output stream.

    // Return
    return $objPHPExcel;
  }
  440. /**
  441. * Get sheet index
  442. *
  443. * @return int
  444. */
  public function getSheetIndex()
  {
    // Index of the sheet that loadIntoExisting() activates and fills
    $index = $this->_sheetIndex;
    return $index;
  }
  448. /**
  449. * Set sheet index
  450. *
  451. * @param int $pValue Sheet index
  452. * @return PHPExcel_Reader_HTML
  453. */
  public function setSheetIndex($pValue = 0)
  {
    // Remember the target sheet index, then return the reader itself so calls can be chained
    $reader = $this;
    $reader->_sheetIndex = $pValue;
    return $reader;
  }
  458. }