/wp-content/plugins/tablepress/libraries/csv-parser.class.php
https://bitbucket.org/mrjt515/tcsa · PHP · 270 lines · 156 code · 25 blank · 89 comment · 68 complexity · 91d4426aa7ac6ce87cef6a1cc0dea9f2 MD5 · raw file
- <?php
- /**
- * CSV Parsing class for TablePress, used for import of CSV files
- *
- * @package TablePress
- * @subpackage Import
- * @author Tobias Bäthge
- * @since 1.0.0
- */
- // Prohibit direct script loading: stop with a message unless the ABSPATH constant is already defined
- defined( 'ABSPATH' ) || die( 'No direct script access allowed!' );
/**
 * CSV Parsing class
 *
 * Turns a CSV string into a two-dimensional array of rows and cells, and can
 * guess the delimiter character by analyzing the first lines of the data.
 *
 * Typical usage: load_data(), then (optionally) find_delimiter(), then parse().
 *
 * @package TablePress
 * @subpackage Import
 * @author Tobias Bäthge
 * @since 1.0.0
 */
class CSV_Parser {

	/**
	 * Enclosure character (double quote).
	 *
	 * @var string
	 */
	protected $enclosure = '"';

	/**
	 * Number of rows to analyze when attempting to auto-detect the delimiter.
	 *
	 * @var int
	 */
	protected $delimiter_search_max_lines = 15;

	/**
	 * Characters to ignore when attempting to auto-detect the delimiter.
	 * Used as the body of a regular expression character class.
	 *
	 * @var string
	 */
	protected $non_delimiter_chars = "a-zA-Z0-9\n\r";

	/**
	 * Preferred delimiter characters, in order of preference. They win over
	 * other candidates when more than one character qualifies as delimiter.
	 *
	 * @var string
	 */
	protected $preferred_delimiter_chars = ";,\t";

	/**
	 * Data to import.
	 *
	 * @var string
	 */
	protected $import_data;

	/**
	 * Highest error level that was found while parsing the input data.
	 *
	 * 0 = No errors found.
	 * 1 = A hopefully correctable syntax error was found.
	 * 2 = The enclosure character (double quote by default) was found in a non-enclosed field.
	 *     This means the file is either corrupt, or does not use standard CSV formatting.
	 *     Please validate the parsed data yourself.
	 *
	 * @var int
	 */
	public $error = 0;

	/**
	 * Detailed error information, keyed by "<line>-<column>" (both 1-based).
	 * Each entry is an array with the keys 'type', 'info', 'line', and 'column'.
	 * Only the first error of a cell is recorded.
	 *
	 * @var array
	 */
	public $error_info = array();

	/**
	 * Constructor
	 *
	 * @since 1.0.0
	 */
	public function __construct() {
		// intentionally left blank
	}

	/**
	 * Load data that shall be parsed
	 *
	 * A trailing line break is appended if it is missing, because a row is
	 * only completed by parse() when a line break is reached.
	 *
	 * @since 1.0.0
	 *
	 * @param string $data Data to be parsed
	 */
	public function load_data( $data ) {
		if ( "\n" !== substr( $data, -1 ) ) {
			$data .= "\n";
		}
		$this->import_data = $data;
	}

	/**
	 * Detect the CSV delimiter, by analyzing some rows to determine most probable delimiter character
	 *
	 * Every non-enclosed character that is not listed in $non_delimiter_chars is
	 * counted per line, for up to $delimiter_search_max_lines lines. Characters
	 * whose per-line counts are consistent are ranked by _check_delimiter_count().
	 *
	 * @since 1.0.0
	 *
	 * @return string|null Most probable delimiter character, or null if no character qualifies
	 */
	public function find_delimiter() {
		$data = (string) $this->import_data;
		$data_length = strlen( $data );
		$delimiter_count = array(); // [ character => [ line index => occurrences ] ]
		$enclosed = false;
		$current_line = 0;
		// The pattern does not change inside the loop, so build it only once.
		$non_delimiter_pattern = '#[' . $this->non_delimiter_chars . ']#i';

		for ( $i = 0; $i < $data_length; $i++ ) {
			$prev_char = ( $i > 0 ) ? $data[ $i - 1 ] : '';
			$curr_char = $data[ $i ];
			$next_char = ( $i + 1 < $data_length ) ? $data[ $i + 1 ] : '';

			if ( $curr_char === $this->enclosure ) {
				if ( ! $enclosed || $next_char !== $this->enclosure ) {
					// opening or closing quote
					$enclosed = ! $enclosed;
				} else {
					// escaped quote ("") inside an enclosed cell: skip its second character
					$i++;
				}
			} elseif ( ! $enclosed && ( "\r" === $curr_char || ( "\n" === $curr_char && "\r" !== $prev_char ) ) ) {
				// reached end of a line (the "\n" of a "\r\n" pair is not counted a second time)
				$current_line++;
				if ( $current_line >= $this->delimiter_search_max_lines ) {
					break;
				}
			} elseif ( ! $enclosed && 0 === preg_match( $non_delimiter_pattern, $curr_char ) ) {
				// a non-enclosed character that is not excluded might be the delimiter
				if ( ! isset( $delimiter_count[ $curr_char ][ $current_line ] ) ) {
					$delimiter_count[ $curr_char ][ $current_line ] = 0;
				}
				$delimiter_count[ $curr_char ][ $current_line ]++;
			}
		}

		// Collect all possible delimiters, indexed by their sort key.
		$potential_delimiters = array();
		foreach ( $delimiter_count as $char => $line_counts ) {
			$sort_key = $this->_check_delimiter_count( $char, $line_counts, $current_line );
			if ( false !== $sort_key ) {
				$potential_delimiters[ $sort_key ] = $char;
			}
		}
		ksort( $potential_delimiters );

		// The smallest sort key belongs to the best candidate (null if there is none).
		return array_shift( $potential_delimiters );
	}

	/**
	 * Check if passed character can be a delimiter, by checking counts in each line
	 *
	 * A character qualifies if it was found in every analyzed line, and if the
	 * count of every line equals the count of the first line (or is one higher,
	 * which is treated as an "almost" match).
	 *
	 * The returned sort key is built so that a smaller key means a better
	 * candidate: position in $preferred_delimiter_chars first ('999' if not
	 * preferred), then exact (1) before "almost" (2) matches, then higher counts.
	 *
	 * @since 1.0.0
	 *
	 * @param string|char $char Character to check
	 * @param array $line_counts Number of occurrences of the character, per line
	 * @param int $number_lines Number of lines that were analyzed
	 * @return bool|string False if delimiter is not possible, string to be used as a sort key if character could be a delimiter
	 */
	protected function _check_delimiter_count( $char, $line_counts, $number_lines ) {
		// was potential delimiter found in every line?
		if ( count( $line_counts ) != $number_lines ) {
			return false;
		}

		// check if count in every line is the same (or one higher for "almost")
		$first = null;
		$equal = null;
		$almost = false;
		foreach ( $line_counts as $count ) {
			if ( null === $first ) {
				$first = $count;
				continue;
			}
			if ( $count == $first ) {
				$equal = true;
			} elseif ( $count == $first + 1 ) {
				$equal = true;
				$almost = true;
			} else {
				// a single deviating line rules the character out, no need to look further
				$equal = false;
				break;
			}
		}

		// check equality only if more than one row
		if ( $number_lines > 1 && ! $equal ) {
			return false;
		}

		// at this point, count is equal in all lines, determine a string to sort priority
		$match = $almost ? 2 : 1;
		$pref = strpos( $this->preferred_delimiter_chars, (string) $char );
		$pref = ( false !== $pref ) ? str_pad( (string) $pref, 3, '0', STR_PAD_LEFT ) : '999';

		return $pref . $match . '.' . ( 99999 - (int) $first );
	}

	/**
	 * Record a syntax error for a cell
	 *
	 * Raises $this->error to the given type if that is higher than the current
	 * value, and stores the details in $this->error_info, unless an error was
	 * already recorded for the same cell.
	 *
	 * @since 1.0.0
	 *
	 * @param int $type Error type (1 or 2, see $error)
	 * @param int $line 1-based line number of the cell
	 * @param int $column 1-based column number of the cell
	 * @param string $details Description that is appended to the generic error message
	 */
	protected function _add_error( $type, $line, $column, $details ) {
		if ( $this->error < $type ) {
			$this->error = $type;
		}

		$key = $line . '-' . $column;
		if ( ! isset( $this->error_info[ $key ] ) ) {
			$this->error_info[ $key ] = array(
				'type' => $type,
				'info' => "Syntax error found in line {$line}. " . $details,
				'line' => $line,
				'column' => $column
			);
		}
	}

	/**
	 * Parse CSV string into 2D array
	 *
	 * Cells that were not enclosed are trimmed, enclosed cells are kept as they are.
	 * A row is only added to the result when its line break is reached.
	 * Syntax problems are reported through $this->error and $this->error_info.
	 *
	 * @since 1.0.0
	 *
	 * @param string $delimiter Delimiter character for the CSV parsing
	 * @return array 2D array with the data from the CSV string
	 */
	public function parse( $delimiter ) {
		$data = (string) $this->import_data;
		$delimiter = (string) $delimiter;
		$white_spaces = str_replace( $delimiter, '', " \t\x0B\0" ); // filter delimiter from the list, if it is a white-space character
		$rows = array(); // complete rows
		$row = array(); // row that is currently built
		$column = 0; // current column index
		$cell_content = ''; // content of the currently processed cell
		$enclosed = false;
		$was_enclosed = false; // enclosed cells are not trimmed of white-space

		// walk through each character in the CSV string
		$data_length = strlen( $data );
		for ( $i = 0; $i < $data_length; $i++ ) {
			$curr_char = $data[ $i ];
			$next_char = ( $i + 1 < $data_length ) ? $data[ $i + 1 ] : '';

			if ( $curr_char === $this->enclosure ) {
				// open/close quotes, and inline quotes
				if ( ! $enclosed ) {
					if ( '' === ltrim( $cell_content, $white_spaces ) ) {
						// nothing but white-space so far: this quote opens an enclosed cell
						$enclosed = true;
						$was_enclosed = true;
					} else {
						// quote in the middle of a non-enclosed cell: keep it, but report it
						$this->_add_error( 2, count( $rows ) + 1, $column + 1, 'Non-enclosed fields can not contain double-quotes.' );
						$cell_content .= $curr_char;
					}
				} elseif ( $next_char === $this->enclosure ) {
					// enclosure character within enclosed cell (" encoded as "")
					$cell_content .= $curr_char;
					$i++; // skip next character
				} elseif ( $next_char !== $delimiter && "\r" !== $next_char && "\n" !== $next_char ) {
					// find the first character after the quote that is not white-space
					$x = $i + 1;
					while ( isset( $data[ $x ] ) && '' === ltrim( $data[ $x ], $white_spaces ) ) {
						$x++;
					}
					if ( isset( $data[ $x ] ) && $data[ $x ] === $delimiter ) {
						// closing quote, followed by white-space and the delimiter: drop the white-space
						$enclosed = false;
						// Continue right before the delimiter, so that the next iteration
						// processes the delimiter itself and ends the cell.
						$i = $x - 1;
					} else {
						// unescaped quote inside an enclosed cell: keep it, and treat it as closing
						$this->_add_error( 1, count( $rows ) + 1, $column + 1, 'A single double-quote was found within an enclosed string. Enclosed double-quotes must be escaped with a second double-quote.' );
						$cell_content .= $curr_char;
						$enclosed = false;
					}
				} else {
					// the " was the closing one for the cell
					$enclosed = false;
				}
			} elseif ( ! $enclosed && ( $curr_char === $delimiter || "\n" === $curr_char || "\r" === $curr_char ) ) {
				// end of cell (by $delimiter), or end of line (by line break, and not enclosed!)
				$row[ $column ] = $was_enclosed ? $cell_content : trim( $cell_content );
				$cell_content = '';
				$was_enclosed = false;
				$column++;

				// end of line
				if ( "\n" === $curr_char || "\r" === $curr_char ) {
					// append completed row
					$rows[] = $row;
					$row = array();
					$column = 0;
					if ( "\r" === $curr_char && "\n" === $next_char ) {
						$i++; // skip next character in \r\n line breaks
					}
				}
			} else {
				// append character to current cell
				$cell_content .= $curr_char;
			}
		}

		return $rows;
	}

} // class CSV_Parser