/wp-content/plugins/tablepress/libraries/csv-parser.class.php

https://bitbucket.org/mrjt515/tcsa · PHP · 270 lines · 156 code · 25 blank · 89 comment · 68 complexity · 91d4426aa7ac6ce87cef6a1cc0dea9f2 MD5 · raw file

  1. <?php
  2. /**
  3. * CSV Parsing class for TablePress, used for import of CSV files
  4. *
  5. * @package TablePress
  6. * @subpackage Import
  7. * @author Tobias Bäthge
  8. * @since 1.0.0
  9. */
  10. // Prohibit direct script loading
  11. defined( 'ABSPATH' ) || die( 'No direct script access allowed!' );
  12. /**
  13. * CSV Parsing class
  14. * @package TablePress
  15. * @subpackage Import
  16. * @author Tobias Bäthge
  17. * @since 1.0.0
  18. */
  19. class CSV_Parser {
  20. // enclosure (double quote)
  21. protected $enclosure = '"';
  22. // number of rows to analyze when attempting to auto-detect delimiter
  23. protected $delimiter_search_max_lines = 15;
  24. // characters to ignore when attempting to auto-detect delimiter
  25. protected $non_delimiter_chars = "a-zA-Z0-9\n\r";
  26. // preferred delimiter characters, only used when all filtering method
  27. // returns multiple possible delimiters (happens very rarely)
  28. protected $preferred_delimiter_chars = ";,\t";
  29. // data to import
  30. protected $import_data;
  31. // error while parsing input data
  32. // 0 = No errors found. Everything should be fine :)
  33. // 1 = Hopefully correctable syntax error was found.
  34. // 2 = Enclosure character (double quote by default) was found in non-enclosed field.
  35. // This means the file is either corrupt, or does not standard CSV formatting.
  36. // Please validate the parsed data yourself.
  37. public $error = 0;
  38. // detailed error info
  39. public $error_info = array();
  40. /**
  41. * Constructor
  42. *
  43. * @since 1.0.0
  44. */
  45. public function __construct() {
  46. // intentionally left blank
  47. }
  48. /**
  49. * Load data that shall be parsed
  50. *
  51. * @since 1.0.0
  52. *
  53. * @param string $data Data to be parsed
  54. */
  55. public function load_data( $data ) {
  56. // check for mandatory trailing line break
  57. if ( substr( $data, -1 ) != "\n" )
  58. $data .= "\n";
  59. $this->import_data = &$data;
  60. }
  61. /**
  62. * Detect the CSV delimiter, by analyzing some rows to determine most probable delimiter character
  63. *
  64. * @since 1.0.0
  65. *
  66. * @return string Most probable delimiter character
  67. */
  68. public function find_delimiter() {
  69. $data = &$this->import_data;
  70. $delimiter_count = array();
  71. $enclosed = false;
  72. $current_line = 0;
  73. // walk through each character in the CSV string (up to $this->delimiter_search_max_lines)
  74. // and search potential delimiter characters
  75. $data_length = strlen( $data );
  76. for ( $i = 0; $i < $data_length; $i++ ) {
  77. $prev_char = ( $i-1 >= 0 ) ? $data[$i-1] : '';
  78. $curr_char = $data[$i];
  79. $next_char = ( $i+1 < $data_length ) ? $data[$i+1] : '';
  80. if ( $curr_char == $this->enclosure ) {
  81. // open and closing quotes
  82. if ( ! $enclosed || $next_char != $this->enclosure )
  83. $enclosed = ! $enclosed; // flip bool
  84. elseif ( $enclosed )
  85. $i++; // skip next character
  86. } elseif ( ( "\n" == $curr_char && "\r" != $prev_char || "\r" == $curr_char ) && ! $enclosed ) {
  87. // reached end of a line
  88. $current_line++;
  89. if ( $current_line >= $this->delimiter_search_max_lines )
  90. break;
  91. } elseif ( ! $enclosed ) {
  92. // at this point $curr_char seems to be used as a delimiter, as it is not enclosed
  93. // count $curr_char if it is not in the non_delimiter_chars list
  94. if ( 0 === preg_match( '#[' . $this->non_delimiter_chars . ']#i', $curr_char ) ) {
  95. if ( ! isset( $delimiter_count[$curr_char][$current_line] ) )
  96. $delimiter_count[$curr_char][$current_line] = 0; // init empty
  97. $delimiter_count[$curr_char][$current_line]++;
  98. }
  99. }
  100. }
  101. // find most probable delimiter, by sorting their counts
  102. $potential_delimiters = array();
  103. foreach ( $delimiter_count as $char => $line_counts ) {
  104. $is_possible_delimiter = $this->_check_delimiter_count( $char, $line_counts, $current_line );
  105. if ( false !== $is_possible_delimiter )
  106. $potential_delimiters[$is_possible_delimiter] = $char;
  107. }
  108. ksort( $potential_delimiters );
  109. // return first array element, as that has the highest count
  110. return array_shift( $potential_delimiters );
  111. }
  112. /**
  113. * Check if passed character can be a delimiter, by checking counts in each line
  114. *
  115. * @since 1.0.0
  116. *
  117. * @param string|char $char Character to check
  118. * @param array $line_counts
  119. * @param int $number_lines
  120. * @return bool|string False if delimiter is not possible, string to be used as a sort key if character could be a delimiter
  121. */
  122. protected function _check_delimiter_count( $char, $line_counts, $number_lines ) {
  123. // was potential delimiter found in every line?
  124. if ( count( $line_counts ) != $number_lines )
  125. return false;
  126. // check if count in every line is the same (or one higher for "almost")
  127. $first = null;
  128. $equal = null;
  129. $almost = false;
  130. foreach ( $line_counts as $line => $count ) {
  131. if ( null == $first ) {
  132. $first = $count;
  133. } elseif ( $count == $first && false !== $equal ) {
  134. $equal = true;
  135. } elseif ( $count == $first + 1 && false !== $equal ) {
  136. $equal = true;
  137. $almost = true;
  138. } else {
  139. $equal = false;
  140. }
  141. }
  142. // check equality only if more than one row
  143. if ( $number_lines > 1 && ! $equal )
  144. return false;
  145. // at this point, count is equal in all lines, determine a string to sort priority
  146. $match = ( $almost ) ? 2 : 1 ;
  147. $pref = strpos( $this->preferred_delimiter_chars, $char );
  148. $pref = ( false !== $pref ) ? str_pad( $pref, 3, '0', STR_PAD_LEFT ) : '999';
  149. return $pref . $match . '.' . ( 99999 - str_pad( $first, 5, '0', STR_PAD_LEFT ) );
  150. }
  151. /**
  152. * Parse CSV string into 2D array
  153. *
  154. * @since 1.0.0
  155. *
  156. * @param string $delimiter Delimiter character for the CSV parsing
  157. * @return array 2D array with the data from the CSV string
  158. */
  159. public function parse( $delimiter ) {
  160. $data = &$this->import_data;
  161. $white_spaces = str_replace( $delimiter, '', " \t\x0B\0" ); // filter delimiter from the list, if it is a white-space character
  162. $rows = array(); // complete rows
  163. $row = array(); // row that is currently built
  164. $column = 0; // current column index
  165. $cell_content = ''; // content of the currently processed cell
  166. $enclosed = false;
  167. $was_enclosed = false; // to determine if cell content will be trimmed of white-space (only for enclosed cells)
  168. // walk through each character in the CSV string
  169. $data_length = strlen( $data );
  170. for ( $i = 0; $i < $data_length; $i++ ) {
  171. $curr_char = $data[$i];
  172. $next_char = ( $i+1 < $data_length ) ? $data[$i+1] : '';
  173. if ( $curr_char == $this->enclosure ) {
  174. // open/close quotes, and inline quotes
  175. if ( ! $enclosed ) {
  176. if ( '' == ltrim( $cell_content, $white_spaces ) ) {
  177. $enclosed = true;
  178. $was_enclosed = true;
  179. } else {
  180. $this->error = 2;
  181. $error_line = count( $rows ) + 1;
  182. $error_column = $column + 1;
  183. if ( ! isset( $this->error_info[ $error_line.'-'.$error_column ] ) ) {
  184. $this->error_info[ $error_line.'-'.$error_column ] = array(
  185. 'type' => 2,
  186. 'info' => "Syntax error found in line {$error_line}. Non-enclosed fields can not contain double-quotes.",
  187. 'line' => $error_line,
  188. 'column' => $error_column
  189. );
  190. }
  191. $cell_content .= $curr_char;
  192. }
  193. } elseif ( $next_char == $this->enclosure ) {
  194. // enclosure character within enclosed cell (" encoded as "")
  195. $cell_content .= $curr_char;
  196. $i++; // skip next character
  197. } elseif ( $next_char != $delimiter && "\r" != $next_char && "\n" != $next_char ) {
  198. // for-loop (instead of while-loop) that skips white-space
  199. for ( $x = ( $i+1 ); isset( $data[$x] ) && '' == ltrim( $data[$x], $white_spaces ); $x++ ) {}
  200. if ( $data[$x] == $delimiter ) {
  201. $enclosed = false;
  202. $i = $x;
  203. } else {
  204. if ( $this->error < 1 )
  205. $this->error = 1;
  206. $error_line = count( $rows ) + 1;
  207. $error_column = $column + 1;
  208. if ( ! isset( $this->error_info[ $error_line.'-'.$error_column ] ) ) {
  209. $this->error_info[ $error_line.'-'.$error_column ] = array(
  210. 'type' => 1,
  211. 'info' => "Syntax error found in line {$error_line}. A single double-quote was found within an enclosed string. Enclosed double-quotes must be escaped with a second double-quote.",
  212. 'line' => $error_line,
  213. 'column' => $error_column
  214. );
  215. }
  216. $cell_content .= $curr_char;
  217. $enclosed = false;
  218. }
  219. } else {
  220. // the " was the closing one for the cell
  221. $enclosed = false;
  222. }
  223. } elseif ( ( $curr_char == $delimiter || "\n" == $curr_char || "\r" == $curr_char ) && ! $enclosed ) {
  224. // end of cell (by $delimiter), or end of line (by line break, and not enclosed!)
  225. $row[$column] = ( $was_enclosed ) ? $cell_content : trim( $cell_content );
  226. $cell_content = '';
  227. $was_enclosed = false;
  228. $column++;
  229. // end of line
  230. if ( "\n" == $curr_char || "\r" == $curr_char ) {
  231. // append completed row
  232. $rows[] = $row;
  233. $row = array();
  234. $column = 0;
  235. if ( "\r" == $curr_char && "\n" == $next_char )
  236. $i++; // skip next character in \r\n line breaks
  237. }
  238. } else {
  239. // append character to current cell
  240. $cell_content .= $curr_char;
  241. }
  242. }
  243. return $rows;
  244. }
  245. } // class CSV_Parser