/Classes/TYPO3/FLOW3/Utility/Unicode/TextIterator.php

https://github.com/christianjul/FLOW3-Composer · PHP · 437 lines · 227 code · 55 blank · 155 comment · 40 complexity · 1e872ffa9f60fc37ef5e9aebcfd90a65 MD5 · raw file

  1. <?php
  2. namespace TYPO3\FLOW3\Utility\Unicode;
  3. /* *
  4. * This script belongs to the FLOW3 package "PHP6". *
  5. * *
  6. * It is free software; you can redistribute it and/or modify it under *
  7. * the terms of the GNU Lesser General Public License, either version 3 *
  8. * of the License, or (at your option) any later version. *
  9. * *
  10. * The TYPO3 project - inspiring people to share! *
  11. * */
  12. use TYPO3\FLOW3\Annotations as FLOW3;
  13. /**
  14. * A PHP-based port of PHP6's built in TextIterator
  15. *
  16. * @FLOW3\Scope("singleton")
  17. */
  class TextIterator implements \Iterator {

      /**
       * Iterator types accepted by the constructor. Only CHARACTER, WORD,
       * LINE and SENTENCE are implemented; CODE_POINT and COMB_SEQUENCE
       * raise an UnsupportedFeatureException for a non-empty subject.
       */
      const CODE_POINT = 1;
      const COMB_SEQUENCE = 2;
      const CHARACTER = 3;
      const WORD = 4;
      const LINE = 5;
      const SENTENCE = 6;

      /**
       * Value of the sentinel element that terminates every element list.
       * The sentinel is always created with the offset -1.
       */
      const DONE = 'DONE';

      /**
       * Rule status names. None of them is used by this class itself.
       */
      const WORD_NONE = 'WORD_NONE';
      const WORD_NONE_LIMIT = 'WORD_NONE_LIMIT';
      const WORD_NUMBER = 'WORD_NUMBER';
      const WORD_NUMBER_LIMIT = 'WORD_NUMBER_LIMIT';
      const WORD_LETTER = 'WORD_LETTER';
      const WORD_LETTER_LIMIT = 'WORD_LETTER_LIMIT';
      const WORD_KANA = 'WORD_KANA';
      const WORD_KANA_LIMIT = 'WORD_KANA_LIMIT';
      const LINE_SOFT = 'LINE_SOFT';
      const LINE_SOFT_LIMIT = 'LINE_SOFT_LIMIT';
      const LINE_HARD = 'LINE_HARD';
      const LINE_HARD_LIMIT = 'LINE_HARD_LIMIT';
      const SENTENCE_TERM = 'SENTENCE_TERM';
      const SENTENCE_TERM_LIMIT = 'SENTENCE_TERM_LIMIT';
      const SENTENCE_SEP = 'SENTENCE_SEP';
      const SENTENCE_SEP_LIMIT = 'SENTENCE_SEP_LIMIT';

      /**
       * Character class (without regex delimiters) matching the punctuation
       * marks . , ! ? ; which separate words and sentences.
       *
       * The former value '[\.|,|!|\?|;]' additionally matched a literal "|",
       * because "|" is not an alternation operator inside a character class.
       */
      const REGEXP_SENTENCE_DELIMITERS = '[.,!?;]';

      /**
       * One of the iterator type constants (CODE_POINT ... SENTENCE)
       *
       * @var integer
       */
      protected $iteratorType;

      /**
       * The text which is split into elements
       *
       * @var string
       */
      protected $subject;

      /**
       * Set to 0 by the constructor; not read anywhere in this class
       *
       * @var integer
       */
      protected $currentPosition;

      /**
       * All elements of the subject, terminated by the DONE sentinel
       *
       * @var \ArrayObject
       */
      protected $iteratorCache;

      /**
       * Cursor over $iteratorCache
       *
       * @var \ArrayIterator
       */
      protected $iteratorCacheIterator;

      /**
       * The element the cursor pointed to before the most recent next() call
       *
       * @var \TYPO3\FLOW3\Utility\Unicode\TextIteratorElement
       */
      protected $previousElement;

      /**
       * Constructs the TextIterator and splits the subject into elements
       * right away.
       *
       * @param string $subject The text to iterate over (cast to string)
       * @param integer $iteratorType The type of iterator, one of the constants CODE_POINT ... SENTENCE
       * @throws \TYPO3\FLOW3\Error\Exception if the iterator type is outside the range 1..6
       * @throws UnsupportedFeatureException for CODE_POINT and COMB_SEQUENCE with a non-empty subject
       */
      public function __construct($subject, $iteratorType = self::CHARACTER) {
          if ($iteratorType < self::CODE_POINT || $iteratorType > self::SENTENCE) {
              throw new \TYPO3\FLOW3\Error\Exception('Fatal error: Invalid iterator type in TextIterator constructor', 1210849014);
          }

          $this->iteratorType = $iteratorType;
          $this->subject = (string)$subject;
          $this->currentPosition = 0;

          // Fill the cache first and only then create the cursor, so the
          // cursor never has to cope with elements appended behind its back.
          $this->iteratorCache = new \ArrayObject();
          $this->generateIteratorElements();
          $this->iteratorCacheIterator = $this->iteratorCache->getIterator();
          $this->iteratorCacheIterator->rewind();

          $this->previousElement = $this->iteratorCacheIterator->current();
      }

      /**
       * Returns the value of the current element
       *
       * @return string The value of the current element
       */
      #[\ReturnTypeWillChange]
      public function current() {
          return $this->getCurrentElement()->getValue();
      }

      /**
       * Advances the iterator to the next element and remembers the element
       * it leaves as the "previous" one.
       *
       * @return void
       */
      #[\ReturnTypeWillChange]
      public function next() {
          $element = $this->getCurrentElement();
          // Beyond the end of the cache there is no element; keep the last
          // real one instead of overwriting it with NULL.
          if ($element !== NULL) {
              $this->previousElement = $element;
          }
          $this->iteratorCacheIterator->next();
      }

      /**
       * Returns the key of the current element, as reported by the
       * underlying cache iterator.
       *
       * @return mixed Key (number) of the current element
       */
      #[\ReturnTypeWillChange]
      public function key() {
          return $this->iteratorCacheIterator->key();
      }

      /**
       * Returns TRUE if the current element is not the end of the iterator.
       *
       * The end is recognised by the sentinel offset -1 only. The element
       * value is deliberately not compared with self::DONE, because a word
       * or sentence of the subject may legitimately read "DONE".
       *
       * @return boolean TRUE if the iterator has not reached its end
       */
      #[\ReturnTypeWillChange]
      public function valid() {
          $element = $this->getCurrentElement();
          return $element !== NULL && $element->getOffset() != -1;
      }

      /**
       * Sets the iterator back to the first element
       *
       * @return void
       */
      #[\ReturnTypeWillChange]
      public function rewind() {
          $this->iteratorCacheIterator->rewind();
      }

      /**
       * Returns the offset of the current element in the original string
       *
       * @return integer The offset of the current element
       */
      public function offset() {
          return $this->getCurrentElement()->getOffset();
      }

      /**
       * Returns the value of the element remembered by the last next() call.
       * Before the first next() call this is the first element.
       *
       * @return string The previous element of the iterator
       */
      public function previous() {
          return $this->previousElement->getValue();
      }

      /**
       * Returns the value of the last element before the DONE sentinel.
       *
       * The iterator is rewound and walked to its end, so afterwards the
       * cursor rests on the sentinel. For an empty subject the sentinel
       * value itself is returned.
       *
       * @return string The last element of the iterator
       */
      public function last() {
          $this->rewind();
          $lastElement = $this->getCurrentElement();
          while ($this->valid()) {
              $lastElement = $this->getCurrentElement();
              $this->next();
          }
          return $lastElement->getValue();
      }

      /**
       * Returns the offset of the first element - not counting the very
       * first one - whose offset is greater than or equal to the given
       * offset.
       *
       * The iterator is rewound first and is left on the element found. If
       * no such element exists, the offset of the element the iterator
       * stopped on is returned (the DONE sentinel, i.e. -1).
       *
       * @param integer $offset The offset of the character
       * @return integer The offset of the element following this character
       */
      public function following($offset) {
          $this->rewind();
          while ($this->valid()) {
              $this->next();
              $candidate = $this->getCurrentElement();
              if ($candidate->getOffset() >= $offset) {
                  return $candidate->getOffset();
              }
          }
          return $this->offset();
      }

      /**
       * Returns the end position (offset + length) of the element preceding
       * the first element that ends at or after the given offset.
       *
       * The iterator is rewound first. If no element qualifies - which
       * includes an empty subject - the end position of the DONE sentinel is
       * returned (-1, provided the sentinel's length defaults to 0).
       *
       * @param integer $offset The offset of the character
       * @return integer The end position of the element preceding this character
       */
      public function preceding($offset) {
          $this->rewind();
          // Initialised up front: for an empty subject the loop body never
          // runs and the sentinel itself has to serve as the result.
          $currentElement = $this->getCurrentElement();
          while ($this->valid()) {
              $precedingElement = $this->getCurrentElement();
              $this->next();
              $currentElement = $this->getCurrentElement();
              if (($currentElement->getOffset() + $currentElement->getLength()) >= $offset) {
                  return $precedingElement->getOffset() + $precedingElement->getLength();
              }
          }
          return $currentElement->getOffset() + $currentElement->getLength();
      }

      /**
       * Returns what the current element reports as its boundary flag.
       *
       * The flags are assigned while splitting the subject:
       *   CHARACTER: no element is flagged
       *   WORD:      the space between words and each of . , ! ? ;
       *   LINE:      the "\n" between lines
       *   SENTENCE:  whitespace stripped from sentences and delimiters which
       *              are not attached to any text
       *
       * @return boolean TRUE if the current element is a boundary element
       */
      public function isBoundary() {
          return $this->getCurrentElement()->isBoundary();
      }

      /**
       * Returns the values of all elements (without the DONE sentinel).
       * The iterator is rewound first and ends up on the sentinel.
       *
       * @return array All elements of the iterator
       */
      public function getAll() {
          $values = array();
          for ($this->rewind(); $this->valid(); $this->next()) {
              $values[] = $this->getCurrentElement()->getValue();
          }
          return $values;
      }

      /**
       * Not supported by this port.
       *
       * @throws UnsupportedFeatureException always
       */
      public function getRuleStatus() {
          throw new \TYPO3\FLOW3\Utility\Unicode\UnsupportedFeatureException('getRuleStatus() is not supported.', 1210849057);
      }

      /**
       * Not supported by this port.
       *
       * @throws UnsupportedFeatureException always
       */
      public function getRuleStatusArray() {
          throw new \TYPO3\FLOW3\Utility\Unicode\UnsupportedFeatureException('getRuleStatusArray() is not supported.', 1210849076);
      }

      /**
       * Not supported by this port.
       *
       * @throws UnsupportedFeatureException always
       */
      public function getAvailableLocales() {
          throw new \TYPO3\FLOW3\Utility\Unicode\UnsupportedFeatureException('getAvailableLocales() is not supported.', 1210849105);
      }

      /**
       * Rewinds the iterator and returns the value of the first element
       *
       * @return string The first element of the iterator
       */
      public function first() {
          $this->rewind();
          return $this->getCurrentElement()->getValue();
      }

      /**
       * Fills the element cache according to the iterator type and
       * terminates it with the DONE sentinel.
       *
       * An empty subject yields the sentinel only - for every iterator type,
       * including the unsupported ones.
       *
       * @return void
       * @throws UnsupportedFeatureException for CODE_POINT and COMB_SEQUENCE
       */
      private function generateIteratorElements() {
          if ($this->subject === '') {
              $this->appendDoneElement();
              return;
          }

          switch ($this->iteratorType) {
              case self::CODE_POINT:
                  throw new \TYPO3\FLOW3\Utility\Unicode\UnsupportedFeatureException('Unsupported iterator type.', 1210849150);
              case self::COMB_SEQUENCE:
                  throw new \TYPO3\FLOW3\Utility\Unicode\UnsupportedFeatureException('Unsupported iterator type.', 1210849151);
              case self::CHARACTER:
                  $this->parseSubjectByCharacter();
                  break;
              case self::WORD:
                  $this->parseSubjectByWord();
                  break;
              case self::LINE:
                  $this->parseSubjectByLine();
                  break;
              case self::SENTENCE:
                  $this->parseSubjectBySentence();
                  break;
          }

          $this->appendDoneElement();
      }

      /**
       * Splits the subject into single UTF-8 characters. Offsets are counted
       * in characters; no element is flagged as boundary.
       *
       * @return void
       */
      private function parseSubjectByCharacter() {
          $characters = preg_split('//u', $this->subject, -1, PREG_SPLIT_NO_EMPTY);
          if ($characters === FALSE) {
              // preg_split() fails on malformed UTF-8. As before, no character
              // elements are produced then - but without the foreach warning.
              return;
          }
          foreach ($characters as $position => $character) {
              $this->appendElement($character, $position, 1, FALSE);
          }
      }

      /**
       * Splits the subject into words. The subject is cut at single spaces;
       * each space and each punctuation mark (. , ! ? ;) becomes a boundary
       * element of length 1, the text in between becomes a word element.
       *
       * Every element carries its real offset in the subject. Empty words,
       * as produced by consecutive spaces, yield no element of their own.
       *
       * @return void
       */
      private function parseSubjectByWord() {
          $delimiterPattern = '/' . self::REGEXP_SENTENCE_DELIMITERS . '/';
          $offset = 0;

          foreach (explode(' ', $this->subject) as $wordIndex => $word) {
              // explode() swallowed exactly one space in front of every word
              // except the first one.
              if ($wordIndex > 0) {
                  $this->appendElement(' ', $offset, 1, TRUE);
                  $offset++;
              }

              $delimiters = array();
              preg_match_all($delimiterPattern, $word, $delimiters);
              $parts = preg_split($delimiterPattern, $word);

              // Part number n is followed by delimiter number n, if there is one.
              foreach ($parts as $partIndex => $part) {
                  if ($part !== '') {
                      $length = \TYPO3\FLOW3\Utility\Unicode\Functions::strlen($part);
                      $this->appendElement($part, $offset, $length, FALSE);
                      $offset += $length;
                  }
                  if (isset($delimiters[0][$partIndex])) {
                      $this->appendElement($delimiters[0][$partIndex], $offset, 1, TRUE);
                      $offset++;
                  }
              }
          }
      }

      /**
       * Splits the subject into lines. Each "\n" becomes a boundary element
       * of length 1 between two line elements.
       *
       * @return void
       */
      private function parseSubjectByLine() {
          $lines = explode("\n", $this->subject);
          $lastIndex = count($lines) - 1;
          $offset = 0;

          foreach ($lines as $lineIndex => $line) {
              $length = \TYPO3\FLOW3\Utility\Unicode\Functions::strlen($line);
              $this->appendElement($line, $offset, $length, FALSE);
              $offset += $length;

              // No line break after the last line.
              if ($lineIndex < $lastIndex) {
                  $this->appendElement("\n", $offset, 1, TRUE);
                  $offset++;
              }
          }
      }

      /**
       * Splits the subject into sentences. A punctuation mark (. , ! ? ;)
       * belongs to the text preceding it; whitespace stripped from a
       * sentence and punctuation marks without preceding text become
       * boundary elements. Text after the last punctuation mark becomes a
       * sentence element without a delimiter.
       *
       * A subject without any punctuation mark is returned as one element,
       * untouched.
       *
       * @return void
       */
      private function parseSubjectBySentence() {
          $delimiterPattern = '/' . self::REGEXP_SENTENCE_DELIMITERS . '/';
          $delimiters = array();
          preg_match_all($delimiterPattern, $this->subject, $delimiters);
          $parts = preg_split($delimiterPattern, $this->subject);

          if (count($parts) == 1) {
              $this->appendElement($parts[0], 0, \TYPO3\FLOW3\Utility\Unicode\Functions::strlen($parts[0]), FALSE);
              return;
          }

          $offset = 0;
          foreach ($parts as $partIndex => $part) {
              // Strips at most one whitespace character at the start and
              // whitespace at the end of the part.
              // NOTE(review): all stripped whitespace is emitted *before* the
              // sentence and as plain spaces, even if it was trailing or a
              // tab/newline - unchanged from the previous behaviour.
              $strippedCount = 0;
              $part = preg_replace('/^\s|\s$/', '', $part, -1, $strippedCount);
              if ($strippedCount > 0) {
                  $this->appendElement(str_repeat(' ', $strippedCount), $offset, $strippedCount, TRUE);
                  $offset += $strippedCount;
              }

              // Part number n is followed by delimiter number n; only the
              // last part has none.
              $delimiter = isset($delimiters[0][$partIndex]) ? $delimiters[0][$partIndex] : NULL;

              if ($part != '') {
                  $sentence = ($delimiter === NULL) ? $part : $part . $delimiter;
                  $length = \TYPO3\FLOW3\Utility\Unicode\Functions::strlen($sentence);
                  $this->appendElement($sentence, $offset, $length, FALSE);
                  $offset += $length;
              } elseif ($delimiter !== NULL) {
                  $this->appendElement($delimiter, $offset, 1, TRUE);
                  $offset++;
              }
          }
      }

      /**
       * Appends one element to the element cache.
       *
       * @param string $value The text of the element
       * @param integer $offset The offset of the element in the subject
       * @param integer $length The length of the element
       * @param boolean $isBoundary Whether the element is a boundary element
       * @return void
       */
      private function appendElement($value, $offset, $length, $isBoundary) {
          $this->iteratorCache->append(new \TYPO3\FLOW3\Utility\Unicode\TextIteratorElement($value, $offset, $length, $isBoundary));
      }

      /**
       * Appends the DONE sentinel (offset -1) which marks the end of the
       * element cache.
       *
       * @return void
       */
      private function appendDoneElement() {
          $this->iteratorCache->append(new \TYPO3\FLOW3\Utility\Unicode\TextIteratorElement(self::DONE, -1));
      }

      /**
       * Returns the element the cache cursor currently points to.
       *
       * @return \TYPO3\FLOW3\Utility\Unicode\TextIteratorElement The current element of the cache, NULL beyond its end
       */
      private function getCurrentElement() {
          return $this->iteratorCacheIterator->current();
      }
  }
  382. ?>