PageRenderTime 24ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 1ms

/Csv/AutoDetect.php

http://php-csv-utils.googlecode.com/
PHP | 423 lines | 237 code | 61 blank | 125 comment | 66 complexity | c93aad0764c04409a1121a89ae488959 MD5 | raw file
  1. <?php
  2. /**
  3. * AutoDetect component
  4. *
  5. * This class accepts a sample of csv and attempts to deduce its format. It then
  6. * can return a Csv_Dialect tailored to that particular csv file
  7. *
  8. * Please read the LICENSE file
  9. *
  10. * @package PHP CSV Utilities
  11. * @subpackage AutoDetect
  12. * @copyright (c) 2010 Luke Visinoni <luke.visinoni@gmail.com>
  13. * @author Luke Visinoni <luke.visinoni@gmail.com>
  14. * @license GNU Lesser General Public License
  15. * @version $Id: AutoDetect.php 86 2010-04-26 03:44:15Z luke.visinoni $
  16. */
  17. class Csv_AutoDetect {
  18. /**
  19. * Attempts to deduce the format of a sample of a csv file and returns a dialect object
  20. * eventually it will throw an exception if it can't deduce the format, but for now it just
  21. * returns the basic csv dialect
  22. *
  23. * @param string A piece of sample data used to deduce the format of the csv file
  24. * @return Csv_Dialect A {@link Csv_Dialect} object with the appropriate settings
  25. * @access protected
  26. */
  27. public function detect($data) {
  28. $linefeed = $this->guessLinefeed($data);
  29. $data = rtrim($data, $linefeed);
  30. $count = count(explode($linefeed, $data));
  31. // threshold is ten, so add one to account for extra linefeed that is supposed to be at the end
  32. if ($count < 10) {
  33. throw new Csv_Exception_CannotDetermineDialect('You must provide at least ten lines in your sample data');
  34. }
  35. list($quote, $delim) = $this->guessQuoteAndDelim($data);
  36. if (!$quote) {
  37. $quote = '"';
  38. }
  39. if (is_null($delim)) {
  40. if (!$delim = $this->guessDelim($data, $linefeed, $quote)) {
  41. throw new Csv_Exception_CannotDetermineDialect('Csv_AutoDetect was unable to determine the file\'s dialect.');
  42. }
  43. }
  44. $dialect = new Csv_Dialect();
  45. $dialect->quotechar = $quote;
  46. $dialect->quoting = $this->guessQuotingStyle($data, $quote, $delim, $linefeed);
  47. $dialect->delimiter = $delim;
  48. $dialect->lineterminator = $linefeed;
  49. return $dialect;
  50. }
  51. /**
  52. * Determines if a csv sample has a header row - not 100% accurate by any means
  53. * It basically looks at each row in each column. If all but the first column are similar,
  54. * it likely has a header. The way we determine this is first by type, then by length
  55. * Other possible methods I could use to determine whether the first row is a header is I
  56. * could look to see if all but the first CONTAIN certain characters or something - think about this
  57. */
  58. public function hasHeader($data) {
  59. $reader = new Csv_Reader_String($data, $this->detect($data));
  60. list($has_headers, $checked, $types, $lengths, $total_lines, $headers) = array(0, 0, array(), array(), $reader->count(), $reader->getRow());
  61. if ($total_lines <= 2) {
  62. // please try again with a a larger file :)
  63. return false;
  64. }
  65. $total_columns = count($headers);
  66. foreach (range(0, $total_columns - 1) as $key => $col) $types[$col] = null;
  67. // loop through each remaining rows
  68. while ($row = $reader->current()) {
  69. // no need to check more than 20 lines
  70. if ($checked > 20) break; $checked++;
  71. $line = $reader->key();
  72. // loop through row and grab type for each column
  73. foreach ($row as $col => $val) {
  74. $types[$col][] = $this->getType($val);
  75. $lengths[$col][] = strlen($val);
  76. }
  77. $reader->next();
  78. }
  79. // now take a vote and if more than a certain threshold have a likely header, we'll return that we think it has a header
  80. foreach ($types as $key => $column) {
  81. $unique = array_unique($column);
  82. if (count($unique) == 1) { // if all are of the same type
  83. if ($unique[0] == $this->getType($headers[$key])) {
  84. // all rows type matched header type, so try length now
  85. $unique = array_unique($lengths[$key]);
  86. if (count($unique) == 1) {
  87. if ($unique[0] == strlen($headers[$key])) {
  88. $has_headers--;
  89. } else {
  90. $has_headers++;
  91. }
  92. }
  93. //printf ("%s is the same as %s<br>", $unique[0], $this->getType($headers[$key]));
  94. } else {
  95. $has_headers++;
  96. }
  97. }
  98. }
  99. return ($has_headers > 0);
  100. }
  101. /**
  102. * Since the reader returns all strings, this checks the type of the string for comparison
  103. * against header row in hasHeader()
  104. *
  105. * @access protected
  106. * @param string Value we're trying to detect the type of
  107. * @return string type of value
  108. * @todo A better way to do this would be to have Csv_Reader cast values to their correct type
  109. */
  110. protected function getType($value) {
  111. switch (true) {
  112. case ctype_digit($value):
  113. return "integer";
  114. case preg_match("/^[array()-9\.]$/i", $value, $matches):
  115. return "double";
  116. case ctype_alnum($value):
  117. default:
  118. return "string";
  119. }
  120. }
  121. /**
  122. * I copied this functionality from python's csv module. Basically, it looks
  123. * for text enclosed by identical quote characters which are in turn surrounded
  124. * by identical characters (the probable delimiter). If there is no quotes, the
  125. * delimiter cannot be determined this way.
  126. *
  127. * @param string A piece of sample data used to deduce the format of the csv file
  128. * @return array An array with the first value being the quote char and the second the delim
  129. * @access protected
  130. */
  131. protected function guessQuoteAndDelim($data) {
  132. $patterns = array();
  133. // delim can be anything but line breaks, quotes, or any type of spaces
  134. $delim = '([^\r\n\w"\'' . chr(32) . chr(30) . chr(160) . '])';
  135. $patterns[] = '/' . $delim . ' ?(["\']).*?(\2)(\1)/'; // ,"something", - anything but whitespace or quotes followed by a possible space followed by a quote followed by anything followed by same quote, followed by same anything but whitespace
  136. $patterns[] = '/(?:^|\n)(["\']).*?(\1)' . $delim . ' ?/'; // 'something', - beginning of line or line break, followed by quote followed by anything followed by quote followed by anything but whitespace or quotes
  137. $patterns[] = '/' . $delim . ' ?(["\']).*?(\2)(?:^|\n)/'; // ,'something' - anything but whitespace or quote followed by possible space followed by quote followed by anything followed by quote, followed by end of line
  138. $patterns[] = '/(?:^|\n)(["\']).*?(\2)(?:$|\n)/'; // 'something' - beginning of line followed by quote followed by anything followed by quote followed by same quote followed by end of line
  139. foreach ($patterns as $pattern) {
  140. if ($nummatches = preg_match_all($pattern, $data, $matches)) {
  141. if ($matches) {
  142. break;
  143. }
  144. }
  145. }
  146. if (!$matches) {
  147. return array("", null); // couldn't guess quote or delim
  148. }
  149. $quotes = array_count_values($matches[2]);
  150. arsort($quotes);
  151. if ($quote = array_shift(array_flip($quotes))) {
  152. $delims = array_count_values($matches[1]);
  153. arsort($delims);
  154. $delim = array_shift(array_flip($delims));
  155. } else {
  156. $quote = ""; $delim = null;
  157. }
  158. return array($quote, $delim);
  159. }
  160. /**
  161. * Attempts to guess the delimiter of a set of data
  162. *
  163. * @param string The data you would like to get the delimiter of
  164. * @access protected
  165. * @return mixed If a delimiter can be found it is returned otherwise false is returned
  166. * @todo - understand what's going on here (I haven't yet had a chance to really look at it)
  167. */
  168. protected function guessDelim($data, $linefeed, $quotechar) {
  169. $charcount = count_chars($data, 1);
  170. $filtered = array();
  171. foreach ($charcount as $char => $count) {
  172. $chr = chr($char);
  173. // if delim is not the quote character and it is an allowed delimiter,
  174. // put it into the list of possible delim characters
  175. if (ord($quotechar) != $char && $this->isValidDelim($chr)) {
  176. $filtered[$char] = $count;
  177. }
  178. }
  179. // count every character on every line
  180. $data = explode($linefeed, $data);
  181. $tmp = array();
  182. $linecount = 0;
  183. foreach ($data as $row) {
  184. if (empty($row)) {
  185. continue;
  186. }
  187. // count non-empty lines
  188. $linecount++;
  189. // do a charcount on this line, but only remember the chars that
  190. // survived the filtering above
  191. $frequency = array_intersect_key(count_chars($row, 1), $filtered);
  192. // store the charcount along with the previous counts
  193. foreach ($frequency as $char => $count) {
  194. if (!array_key_exists($char, $tmp)) {
  195. $tmp[$char] = array();
  196. }
  197. $tmp[$char][] = $count; // this $char appears $count times on this line
  198. }
  199. }
  200. // a potential delimiter must be present on every non-empty line
  201. foreach ($tmp as $char=>$array) {
  202. if (count($array) < 0.98 * $linecount) {
  203. // ... so drop any delimiters that aren't
  204. unset($tmp[$char]);
  205. }
  206. }
  207. foreach ($tmp as $char => $array) {
  208. // a delimiter is very likely to occur the same amount of times on every line,
  209. // so drop delimiters that have too much variation in their frequency
  210. $dev = $this->deviation($array);
  211. if ($dev > 0.5) { // threshold not scientifically determined or something
  212. unset($tmp[$char]);
  213. continue;
  214. }
  215. // calculate average number of appearances
  216. $tmp[$char] = array_sum($tmp[$char]) / count($tmp[$char]);
  217. }
  218. // now, prefer the delimiter with the highest average number of appearances
  219. if (count($tmp) > 0) {
  220. asort($tmp);
  221. $delim = chr(end(array_keys($tmp)));
  222. } else {
  223. // no potential delimiters remain
  224. $delim = false;
  225. }
  226. return $delim;
  227. }
  228. /**
  229. * @todo Clean this up, this is hideous...
  230. */
  231. protected function isValidDelim($char) {
  232. $ord = ord($char);
  233. if ($char == chr(32) || $char == chr(30) || $char == chr(160)) {
  234. // exclude spaces of any kind...
  235. return false;
  236. }
  237. if ($ord >= ord("a") && $ord <= ord("z")) {
  238. // exclude a-z
  239. return false;
  240. }
  241. if ($ord >= ord("A") && $ord <= ord("Z")) {
  242. // exclude A-Z
  243. return false;
  244. }
  245. if ($ord >= ord("0") && $ord <= ord("9")) {
  246. // exclude 0-9
  247. return false;
  248. }
  249. if ($ord == ord("\n") || $ord == ord("\r")) {
  250. // exclude linefeeds
  251. return false;
  252. }
  253. return true;
  254. }
  255. /**
  256. * @todo - understand what's going on here (I haven't yet had a chance to really look at it)
  257. */
  258. protected function deviation ($array){
  259. $avg = array_sum($array) / count($array);
  260. foreach ($array as $value) {
  261. $variance[] = pow($value - $avg, 2);
  262. }
  263. $deviation = sqrt(array_sum($variance) / count($variance));
  264. return $deviation;
  265. }
  266. /**
  267. * Guess what the line feed character is, default to CR/LF
  268. * @access protected
  269. * @return string The line feed character(s)
  270. * @param $data string The raw CSV data
  271. * @todo - maybe rewrite this? it seems to be not working every time
  272. */
  273. protected function guessLinefeed($data) {
  274. $charcount = count_chars($data);
  275. $cr = "\r";
  276. $lf = "\n";
  277. $count_cr = $charcount[ord($cr)];
  278. $count_lf = $charcount[ord($lf)];
  279. if ($count_cr == $count_lf) {
  280. return "$cr$lf";
  281. }
  282. if ($count_cr == 0 && $count_lf > 0) {
  283. return "$lf";
  284. }
  285. if ($count_lf == 0 && $count_cr > 0) {
  286. return "$cr";
  287. }
  288. // sane default: cr+lf
  289. return "$cr$lf";
  290. }
  291. /**
  292. * Guess what the quoting style is, default to none
  293. * @access protected
  294. * @return integer (quoting style constant qCal_Dialect::QUOTE_NONE)
  295. * @param $data string The raw CSV data
  296. * @param $quote string The quote character
  297. * @param $delim string The delimiter character
  298. * @param $linefeed string The line feed character
  299. */
  300. protected function guessQuotingStyle($data, $quote, $delim, $linefeed) {
  301. $dialect = new Csv_Dialect();
  302. $dialect->delimiter = $delim;
  303. $dialect->quotechar = $quote;
  304. $dialect->lineterminator = $linefeed;
  305. $lines = explode($linefeed, $data);
  306. $lines_processed = 0;
  307. $reader = new Csv_Reader_String($data, $dialect);
  308. $quotingstyle_count = array();
  309. foreach ($reader as $parsedline) {
  310. do {
  311. // fetch next line until a non-empty line is found
  312. $line = array_shift($lines);
  313. } while (strlen($line) == 0);
  314. // how many quotes are present in the raw line?
  315. $quote_count = substr_count($line, $quote);
  316. // how many quotes are within the data?
  317. $quotecount_in_data = substr_count(implode("", $parsedline), $quote);
  318. // how many columns are in this line?
  319. $column_count = count($parsedline);
  320. // how many nonnumeric columns are in this line?
  321. // how many special char columns are in this line?
  322. $nonnumeric_count = 0;
  323. foreach ($parsedline as $column) {
  324. if (preg_match('/[^0-9]/', $column)) {
  325. $nonnumeric_count++;
  326. }
  327. }
  328. // default quoting style for this line: QUOTE_NONE
  329. $quotingstyle = Csv_Dialect::QUOTE_NONE;
  330. // determine this line's quoting style
  331. if ($quote_count == 0 || $quote_count <= $quotecount_in_data) {
  332. // there are no quotes, or there are less quotes than the number of quotes in the data
  333. $quotingstyle = Csv_Dialect::QUOTE_NONE;
  334. } elseif ($quote_count >= ($column_count * 2)) {
  335. // the number of quotes is larger than, or equal to, the number of quotes
  336. // necessary to quote each column
  337. $quotingstyle = Csv_Dialect::QUOTE_ALL;
  338. } elseif ($quote_count >= $quotecount_in_data) {
  339. // there are more quotes than the number of quotes in the data
  340. $quotingstyle = Csv_Dialect::QUOTE_MINIMAL;
  341. // determine if the number of nonnumeric columns times two is equal to
  342. // the number of quotes minus the number of quotes in the data
  343. if (($nonnumeric_count * 2) == ($quote_count - $quotecount_in_data)) {
  344. $quotingstyle = Csv_Dialect::QUOTE_NONNUMERIC;
  345. }
  346. }
  347. if (!array_key_exists($quotingstyle, $quotingstyle_count)) {
  348. $quotingstyle_count[$quotingstyle] = 0;
  349. }
  350. $quotingstyle_count[$quotingstyle]++;
  351. $lines_processed++;
  352. if ($lines_processed > 15) {
  353. // don't process the whole file - stop processing after fifteen lines
  354. break;
  355. }
  356. }
  357. // return the quoting style that was used most often
  358. asort($quotingstyle_count);
  359. $guess = end(array_keys($quotingstyle_count));
  360. return $guess;
  361. }
  362. }