PageRenderTime 42ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/oiserver/lib/common/tableExtractor.php

http://openirudi.googlecode.com/
PHP | 383 lines | 212 code | 95 blank | 76 comment | 35 complexity | 3b393270cada5b2a6feb6853489f1e84 MD5 | raw file
Possible License(s): LGPL-2.1, AGPL-3.0
  1. <?php
  2. /*----------------------------------------------------------------------
  3. Table Extractor
  4. ===============
  5. Table extractor is a php class that can extract almost any table
  6. from any html document/page, and then convert that html table into
  7. a php array.
  8. Version 1.3
  9. Compatibility: PHP 4.4.1 +
  10. Copyright Jack Sleight - www.reallyshiny.com
  11. This script is licensed under the Creative Commons License.
  12. ----------------------------------------------------------------------*/
  /**
   * Extracts a single HTML table from a document and converts it to a PHP array.
   *
   * Typical use: set $source (and optionally $anchor and the other options
   * below), then call extractTable(). The three processing steps
   * (cleanHTML(), prepareArray(), createArray()) can also be called one by one.
   *
   * Known limitation: the table is cut at the first "</table>" that follows
   * its opening tag, so tables containing nested tables are truncated.
   */
  class tableExtractor {

      /** HTML text that is searched for the table. */
      var $source = NULL;

      /** Text that appears before the wanted table (or inside it, see $anchorWithin). */
      var $anchor = NULL;

      /** When true the anchor lies inside the table: the last "<table" before the end of the anchor is used. */
      var $anchorWithin = false;

      /** Truthy: a header row supplies the column keys. true means row 1, any other truthy value is used as the row number. */
      var $headerRow = true;

      /** 1-based position of the first row to return, counted after the header and dropped rows were removed. */
      var $startRow = 1;

      /** Maximum number of rows to return; 0 means no limit. */
      var $maxRows = 0;

      /** 1-based position of the first column to return. */
      var $startCol = 1;

      /** Maximum number of columns to return; 0 means no limit. */
      var $maxCols = 0;

      /** When true, strip_tags() is applied to every returned cell. */
      var $stripTags = false;

      /**
       * Extra columns derived from cell content. Each entry is an array with the keys
       * 'column' (source column position), 'regex' (pattern applied to the raw cell)
       * and 'names' (one key, or an array of keys, receiving capture groups 1..n).
       */
      var $extraCols = array();

      /** Number of data rows found by prepareArray() (the header row is not counted). */
      var $rowCount = 0;

      /** Comma separated list (or array) of raw row numbers to drop; row 1 is the first <tr>, header included. */
      var $dropRows = NULL;

      /** Normalised table markup produced by cleanHTML(). */
      var $cleanHTML = NULL;

      /** Row/column array produced by prepareArray(); both levels are 1-based. */
      var $rawArray = NULL;

      /** Result of the last createArray() call. */
      var $finalArray = NULL;

      /** Scratch counter used while filling extra columns; declared so it is no longer created dynamically. */
      var $extraColsCount = 0;

      /**
       * Runs the complete extraction.
       *
       * @return array Rows keyed 1..n; each row is keyed by header name, or by
       *               column position when there is no usable header.
       */
      function extractTable() {
          $this->cleanHTML();
          $this->prepareArray();
          return $this->createArray();
      }

      /**
       * Locates the table in $source and stores a normalised copy in $cleanHTML.
       *
       * Normalisation: table/tr/th/td tag names are lower-cased, thead/tbody tags
       * are removed, th is rewritten to td, and CRLF pairs are removed.
       * When no "<table" can be found, $cleanHTML is set to an empty string.
       * A missing or empty anchor, or one that does not occur in $source,
       * makes the search start at the beginning of the document.
       *
       * @return void
       */
      function cleanHTML() {
          // php 4 compatibility function
          if (!function_exists('stripos')) {
              function stripos($haystack, $needle, $offset = 0) {
                  return strpos(strtolower($haystack), strtolower($needle), $offset);
              }
          }

          $source = (string) $this->source;
          $anchor = (string) $this->anchor;

          // Work out where the search for "<table" starts.
          $startSearch = 0;
          if ($anchor !== '') {
              $anchorPos = stripos($source, $anchor);
              if ($anchorPos !== false) {
                  $startSearch = $anchorPos;
                  if ($this->anchorWithin) {
                      /*------------------------------------------------------------
                      With thanks to Khary Sharp for suggesting and writing
                      the anchor within functionality.
                      ------------------------------------------------------------*/
                      // Search the reversed text so the first hit is the last
                      // "<table" that precedes the end of the anchor.
                      $reversed = strrev(substr($source, 0, $anchorPos + strlen($anchor)));
                      $reversedPos = stripos($reversed, strrev('<table'));
                      if ($reversedPos !== false) {
                          $startSearch = strlen($reversed) - $reversedPos - 6;
                      }
                  }
              }
          }

          // Extract the table markup.
          $startTable = stripos($source, '<table', $startSearch);
          if ($startTable === false) {
              $this->cleanHTML = '';
              return;
          }
          $endTable = stripos($source, '</table>', $startTable);
          if ($endTable === false) {
              // Unterminated table: take everything up to the end of the document.
              $table = substr($source, $startTable);
          } else {
              $table = substr($source, $startTable, $endTable + 8 - $startTable);
          }

          // lowercase all table related tags
          $table = preg_replace_callback('/<(\/?)(table|tr|th|td)/is', array($this, '_lowercaseTag'), $table);
          // remove all thead and tbody tags
          $table = preg_replace('/<\/?(thead|tbody).*?>/is', '', $table);
          // replace th tags with td tags (\b keeps other tags starting with "th" intact)
          $table = preg_replace('/<(\/?)th\b(.*?)>/is', '<$1td$2>', $table);
          // clean string
          $table = trim($table);
          $table = str_replace("\r\n", "", $table);

          $this->cleanHTML = $table;
      }

      /**
       * preg_replace_callback() helper: lower-cases the matched tag start.
       *
       * @param array $match Match array; element 0 is the full match.
       * @return string
       */
      function _lowercaseTag($match) {
          return strtolower($match[0]);
      }

      /**
       * Splits $cleanHTML into rows and cells and stores them in $rawArray.
       *
       * Also sets $rowCount. Cells with colspan/rowspan attributes are copied
       * into the positions they cover.
       *
       * @return void
       */
      function prepareArray() {
          // split table into individual elements (tags are kept as separate items)
          $pattern = '/(<\/?(?:tr|td)\b.*?>)/is';
          $pieces = preg_split($pattern, (string) $this->cleanHTML, -1, PREG_SPLIT_DELIM_CAPTURE);

          $tableCleaned = array();
          $rowCount = 0;
          $colCount = 1;
          $tdOpen = false;
          $tdTag = '';

          foreach ($pieces as $item) {
              $item = str_replace('&nbsp;', '', $item);
              $item = trim($item);
              // keep the original text: the td attributes are needed further down
              $itemUnedited = $item;
              // reduce a tag to its bare form so it can be compared below
              $item = preg_replace('/<(\/?)(table|tr|td)\b.*?>/is', '<$1$2>', $item);

              switch ($item) {
                  case '<tr>':
                      // start a new row
                      $rowCount++;
                      $colCount = 1;
                      break;
                  case '<td>':
                      // save the td tag for later use
                      $tdTag = $itemUnedited;
                      $tdOpen = true;
                      break;
                  case '</td>':
                      $tdOpen = false;
                      break;
                  case '</tr>':
                      break;
                  default:
                      // anything else is cell content, but only inside an open td
                      if ($tdOpen) {
                          // a colspan of 0 is treated as 1 so the cell is not lost
                          if (preg_match('/<td [^>]*colspan\s*=\s*(?:\'|")?\s*([0-9]+)[^>]*>/is', $tdTag, $matches)) {
                              $colspan = max(1, (int) $matches[1]);
                          } else {
                              $colspan = 1;
                          }
                          if (preg_match('/<td [^>]*rowspan\s*=\s*(?:\'|")?\s*([0-9]+)[^>]*>/is', $tdTag, $matches)) {
                              $rowspan = (int) $matches[1];
                          } else {
                              $rowspan = 0;
                          }
                          // loop over the colspans
                          for ($c = 0; $c < $colspan; $c++) {
                              // if the position was already filled by an earlier rowspan, use the next one
                              if (!isset($tableCleaned[$rowCount][$colCount])) {
                                  $tableCleaned[$rowCount][$colCount] = $item;
                              } else {
                                  $tableCleaned[$rowCount][$colCount + 1] = $item;
                              }
                              // copy the value into the following rows covered by the rowspan
                              $futureRows = $rowCount;
                              for ($r = 1; $r < $rowspan; $r++) {
                                  $futureRows++;
                                  if ($colspan > 1) {
                                      $tableCleaned[$futureRows][$colCount + 1] = $item;
                                  } else {
                                      $tableCleaned[$futureRows][$colCount] = $item;
                                  }
                              }
                              $colCount++;
                          }
                          // sort the row by column key (rowspan inserts break the order)
                          if (isset($tableCleaned[$rowCount])) {
                              ksort($tableCleaned[$rowCount]);
                          }
                      }
                      break;
              }
          }

          // set row count; never negative, even for an empty table
          $total = count($tableCleaned);
          if ($this->headerRow) {
              $this->rowCount = max(0, $total - 1);
          } else {
              $this->rowCount = $total;
          }

          $this->rawArray = $tableCleaned;
      }

      /**
       * Builds the final array from $rawArray, applying the header, dropRows,
       * start/max row and column limits, extraCols and stripTags options.
       *
       * The header row and dropped rows are removed from $rawArray as a side effect.
       *
       * @return array Rows keyed 1..n, also stored in $finalArray.
       */
      function createArray() {
          $tableData = array();
          $columnNames = array();

          if (!is_array($this->rawArray)) {
              $this->rawArray = array();
          }

          // get column headers
          if ($this->headerRow) {
              $headerIndex = ($this->headerRow === true) ? 1 : $this->headerRow;
              if (isset($this->rawArray[$headerIndex])) {
                  $uniqueNames = array();
                  $colCount = 0;
                  foreach ($this->rawArray[$headerIndex] as $cell) {
                      $colCount++;
                      $cell = trim(strip_tags($cell));
                      // save name if there is one ("0" counts as a name), otherwise save index
                      if ($cell !== '') {
                          if (isset($uniqueNames[$cell])) {
                              // repeated header: append " (2)", " (3)", ...
                              $uniqueNames[$cell]++;
                              $suffix = $uniqueNames[$cell] + 1;
                              $cell .= ' (' . $suffix . ')';
                          } else {
                              $uniqueNames[$cell] = 0;
                          }
                          $columnNames[$colCount] = $cell;
                      } else {
                          $columnNames[$colCount] = $colCount;
                      }
                  }
                  // remove the headers row from the table
                  unset($this->rawArray[$headerIndex]);
              }
          }

          // remove rows to drop (accepts "2,3", "2, 3" or an array)
          $dropRows = $this->dropRows;
          if (!is_array($dropRows)) {
              if ($dropRows === NULL || $dropRows === '') {
                  $dropRows = array();
              } else {
                  $dropRows = explode(',', (string) $dropRows);
              }
          }
          foreach ($dropRows as $value) {
              unset($this->rawArray[trim((string) $value)]);
          }

          // set the end row
          if ($this->maxRows) {
              $endRow = $this->startRow + $this->maxRows - 1;
          } else {
              $endRow = count($this->rawArray);
          }

          $rowCount = 0;
          $newRowCount = 0;
          foreach ($this->rawArray as $row) {
              $rowCount++;
              // skip rows outside the requested window
              if ($rowCount < $this->startRow || $rowCount > $endRow) {
                  continue;
              }
              $newRowCount++;
              $tableData[$newRowCount] = array();

              // set the end column
              if ($this->maxCols) {
                  $endCol = $this->startCol + $this->maxCols - 1;
              } else {
                  $endCol = count($row);
              }

              $colCount = 0;
              $newColCount = 0;
              foreach ($row as $cell) {
                  $colCount++;
                  // skip columns outside the requested window
                  if ($colCount < $this->startCol || $colCount > $endCol) {
                      continue;
                  }
                  $newColCount++;

                  // extra columns are filled from the raw cell, before tags are stripped
                  if ($this->extraCols) {
                      foreach ($this->extraCols as $extraColumn) {
                          if ($extraColumn['column'] != $colCount) {
                              continue;
                          }
                          $matches = array();
                          $matched = preg_match($extraColumn['regex'], $cell, $matches);
                          $this->extraColsCount = 0;
                          if (is_array($extraColumn['names'])) {
                              // name i receives capture group i, or '' when it is missing
                              foreach ($extraColumn['names'] as $extraColumnSub) {
                                  $this->extraColsCount++;
                                  if ($matched && isset($matches[$this->extraColsCount])) {
                                      $tableData[$newRowCount][$extraColumnSub] = $matches[$this->extraColsCount];
                                  } else {
                                      $tableData[$newRowCount][$extraColumnSub] = '';
                                  }
                              }
                          } else {
                              if ($matched && isset($matches[1])) {
                                  $tableData[$newRowCount][$extraColumn['names']] = $matches[1];
                              } else {
                                  $tableData[$newRowCount][$extraColumn['names']] = '';
                              }
                          }
                      }
                  }

                  if ($this->stripTags) {
                      $cell = strip_tags($cell);
                  }

                  // key by column name when a header supplies one, otherwise by position
                  $colKey = $newColCount;
                  if ($this->headerRow && isset($columnNames[$colCount])) {
                      $colKey = $columnNames[$colCount];
                  }
                  $tableData[$newRowCount][$colKey] = $cell;
              }
          }

          $this->finalArray = $tableData;
          return $tableData;
      }
  }
  283. ?>