PageRenderTime 24ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/phpspreadsheet/vendor/phpoffice/phpspreadsheet/src/PhpSpreadsheet/Reader/Csv/Delimiter.php

https://bitbucket.org/moodle/moodle
PHP | 151 lines | 95 code | 31 blank | 25 comment | 8 complexity | 7767b1ba1bdd5a9319bf7ae9cded787b MD5 | raw file
Possible License(s): Apache-2.0, LGPL-2.1, BSD-3-Clause, MIT, GPL-3.0
  1. <?php
  2. namespace PhpOffice\PhpSpreadsheet\Reader\Csv;
  /**
   * Guesses the field delimiter of a CSV stream.
   *
   * On construction the stream is sampled line by line (ignoring anything inside
   * enclosures) and the number of occurrences of every candidate delimiter is
   * recorded per line. {@see Delimiter::infer()} then picks the candidate whose
   * per-line count is the most consistent.
   */
  class Delimiter
  {
      /** Candidate delimiters, in order of preference when scores tie. */
      protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

      /** Maximum number of logical lines sampled from the stream. */
      protected const MAX_SAMPLED_LINES = 1000;

      /** @var resource */
      protected $fileHandle;

      /** @var string */
      protected $escapeCharacter;

      /** @var string */
      protected $enclosure;

      /** @var array<string, int[]> per-delimiter list of occurrence counts, one entry per sampled line */
      protected $counts = [];

      /** @var int */
      protected $numberLines = 0;

      /** @var ?string */
      protected $delimiter;

      /**
       * Stores the settings and immediately samples the stream, reading from the
       * handle's current position.
       *
       * @param resource $fileHandle
       */
      public function __construct($fileHandle, string $escapeCharacter, string $enclosure)
      {
          $this->fileHandle = $fileHandle;
          $this->escapeCharacter = $escapeCharacter;
          $this->enclosure = $enclosure;

          $this->countPotentialDelimiters();
      }

      /**
       * Delimiter to fall back on: the first entry of the candidate list.
       */
      public function getDefaultDelimiter(): string
      {
          return self::POTENTIAL_DELIMETERS[0];
      }

      /**
       * Number of logical lines that were sampled (at most MAX_SAMPLED_LINES).
       */
      public function linesCounted(): int
      {
          return $this->numberLines;
      }

      /**
       * Samples up to MAX_SAMPLED_LINES logical lines and tallies the candidates in each.
       *
       * The limit is checked before reading so that every line counted in
       * $numberLines also has an entry in each $counts series.
       */
      protected function countPotentialDelimiters(): void
      {
          $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
          $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

          $this->numberLines = 0;
          while ($this->numberLines < self::MAX_SAMPLED_LINES) {
              $line = $this->getNextLine();
              if ($line === false) {
                  break;
              }

              ++$this->numberLines;
              $this->countDelimiterValues($line, $delimiterKeys);
          }
      }

      /**
       * Appends, for every candidate delimiter, its number of occurrences in $line.
       *
       * @param array $delimiterKeys delimiters to count, given as array keys;
       *                             candidates missing from it are recorded as 0
       */
      protected function countDelimiterValues(string $line, array $delimiterKeys): void
      {
          foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
              $this->counts[$delimiter][] = array_key_exists($delimiter, $delimiterKeys)
                  ? substr_count($line, $delimiter)
                  : 0;
          }
      }

      /**
       * Picks the delimiter whose per-line count deviates least from its median.
       *
       * Candidates with a median count of zero (not found consistently) are ignored.
       *
       * @return ?string the inferred delimiter; if no candidate qualifies, the value
       *                 left by a previous call (null on a fresh instance)
       */
      public function infer(): ?string
      {
          // Calculate the mean square deviation for each delimiter
          $meanSquareDeviations = [];
          foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
              $series = $this->counts[$delimiter] ?? [];
              $sampleSize = count($series);
              if ($sampleSize === 0) {
                  // Nothing was sampled (e.g. empty stream): there is no median to compute
                  continue;
              }

              sort($series);
              $middleIdx = intdiv($sampleSize - 1, 2);
              $median = ($sampleSize % 2 === 1)
                  ? $series[$middleIdx]
                  : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

              if ((float) $median === 0.0) {
                  continue;
              }

              $meanSquareDeviations[$delimiter] = array_reduce(
                  $series,
                  static function ($sum, $value) use ($median) {
                      return $sum + ($value - $median) ** 2;
                  },
                  0
              ) / $sampleSize;
          }

          // ... and pick the delimiter with the smallest mean square deviation.
          // The map was filled in candidate order and the comparison is strict,
          // so ties are resolved in favour of the earlier candidate.
          $min = INF;
          foreach ($meanSquareDeviations as $delimiter => $deviation) {
              if ($deviation < $min) {
                  $min = $deviation;
                  $this->delimiter = $delimiter;
              }
          }

          return $this->delimiter;
      }

      /**
       * Get the next full (logical) line from the file, with enclosed content removed.
       *
       * Physical lines are joined while an enclosure is still open, so a quoted
       * field spanning several lines is treated as part of one logical line.
       *
       * @return false|string false at end of file (including when the file ends
       *                      inside an unterminated enclosure) or on a regex failure
       */
      public function getNextLine()
      {
          if ($this->enclosure === '') {
              // Without an enclosure there is nothing to strip, and the
              // "still open" pattern below would match every string.
              return fgets($this->fileHandle);
          }

          // An enclosure character counts only when not preceded by the escape character
          $quote = preg_quote($this->enclosure, '/');
          if ($this->escapeCharacter !== '') {
              $quote = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')' . $quote;
          }
          $enclosedPattern = '/(' . $quote . '.*' . $quote . ')/Us';
          $openEnclosurePattern = '/(' . $quote . ')/';

          $line = '';
          do {
              // Get the next line in the file
              $newLine = fgets($this->fileHandle);

              // Return false if there is no next line
              if ($newLine === false) {
                  return false;
              }

              // Drop everything that is enclosed to avoid counting false positives in enclosures
              $stripped = preg_replace($enclosedPattern, '', $line . $newLine);
              if ($stripped === null) {
                  // preg_replace failed; report "no line" so sampling stops
                  return false;
              }
              $line = $stripped;

              // If an enclosure is still open we need to read the next line as well
          } while (preg_match($openEnclosurePattern, $line) > 0);

          return $line;
      }
  }