/lib/vendor/algo26-matthias/idna-convert/src/NamePrep/NamePrep.php

https://github.com/NavigateCMS/Navigate-CMS · PHP · 318 lines · 221 code · 36 blank · 61 comment · 50 complexity · 62a6c01ef88ace1d70298e8eac6f60c5 MD5 · raw file

  1. <?php
  2. namespace Algo26\IdnaConvert\NamePrep;
  3. use Algo26\IdnaConvert\Exception\InvalidCharacterException;
  4. use Algo26\IdnaConvert\Exception\InvalidIdnVersionException;
  5. class NamePrep implements NamePrepInterface
  6. {
  7. const sBase = 0xAC00;
  8. const lBase = 0x1100;
  9. const vBase = 0x1161;
  10. const tBase = 0x11A7;
  11. const lCount = 19;
  12. const vCount = 21;
  13. const tCount = 28;
  14. const nCount = 588; // vCount * tCount
  15. const sCount = 11172; // lCount * tCount * vCount
  16. const sLast = self::sBase + self::lCount * self::vCount * self::tCount;
  17. /** @var NamePrepDataInterface */
  18. private $namePrepData;
  19. /**
  20. * @param string|null $idnVersion
  21. *
  22. * @throws InvalidIdnVersionException
  23. */
  24. public function __construct(?string $idnVersion = null)
  25. {
  26. if ($idnVersion === null || $idnVersion == 2008) {
  27. $this->namePrepData = new NamePrepData2008();
  28. return;
  29. }
  30. if ($idnVersion == 2003) {
  31. $this->namePrepData = new NamePrepData2003();
  32. return;
  33. }
  34. throw new InvalidIdnVersionException('IDN version must bei either 2003 or 2008');
  35. }
  36. /**
  37. * @param array $inputArray
  38. *
  39. * @return array
  40. * @throws InvalidCharacterException
  41. */
  42. public function do(array $inputArray): array
  43. {
  44. $outputArray = $this->applyCharacterMaps($inputArray);
  45. $outputArray = $this->hangulCompose($outputArray);
  46. $outputArray = $this->combineCodePoints($outputArray);
  47. return $outputArray;
  48. }
  49. /**
  50. * @param array $inputArray
  51. *
  52. * @return array
  53. * @throws InvalidCharacterException
  54. */
  55. private function applyCharacterMaps(array $inputArray): array
  56. {
  57. $outputArray = [];
  58. foreach ($inputArray as $codePoint) {
  59. // Map to nothing == skip that code point
  60. if (in_array($codePoint, $this->namePrepData->mapToNothing)) {
  61. continue;
  62. }
  63. // Try to find prohibited input
  64. if (in_array($codePoint, $this->namePrepData->prohibit)
  65. || in_array($codePoint, $this->namePrepData->generalProhibited)
  66. ) {
  67. throw new InvalidCharacterException(sprintf('Prohibited input U+%08X', $codePoint), 101);
  68. }
  69. foreach ($this->namePrepData->prohibitRanges as $range) {
  70. if ($range[0] <= $codePoint && $codePoint <= $range[1]) {
  71. throw new InvalidCharacterException(sprintf('Prohibited input U+%08X', $codePoint), 102);
  72. }
  73. }
  74. if (0xAC00 <= $codePoint && $codePoint <= 0xD7AF) {
  75. // Hangul syllable decomposition
  76. foreach ($this->hangulDecompose($codePoint) as $decomposed) {
  77. $outputArray[] = (int) $decomposed;
  78. }
  79. } elseif (isset($this->namePrepData->replaceMaps[$codePoint])) {
  80. foreach ($this->applyCanonicalOrdering($this->namePrepData->replaceMaps[$codePoint]) as $reordered) {
  81. $outputArray[] = (int) $reordered;
  82. }
  83. } else {
  84. $outputArray[] = (int) $codePoint;
  85. }
  86. }
  87. return $outputArray;
  88. }
  89. private function combineCodePoints(array $codePoints): array
  90. {
  91. $previousClass = 0;
  92. $previousStarter = 0;
  93. $outputLength = count($codePoints);
  94. for ($outerIndex = 0; $outerIndex < $outputLength; ++$outerIndex) {
  95. $combiningClass = $this->getCombiningClass($codePoints[$outerIndex]);
  96. if (
  97. ($previousClass === 0 || $previousClass > $combiningClass)
  98. && $combiningClass !== 0
  99. ) {
  100. // Try to match
  101. $sequenceLength = $outerIndex - $previousStarter;
  102. $combined = $this->combine(array_slice($codePoints, $previousStarter, $sequenceLength));
  103. // On match: Replace the last starter with the composed character and remove
  104. // the now redundant non-starter(s)
  105. if (false !== $combined) {
  106. $codePoints[$previousStarter] = $combined;
  107. if ($sequenceLength > 1) {
  108. for ($innerIndex = $outerIndex + 1; $innerIndex < $outputLength; ++$innerIndex) {
  109. $codePoints[$innerIndex - 1] = $codePoints[$innerIndex];
  110. }
  111. unset($codePoints[$outputLength]);
  112. }
  113. // Rewind the for loop by one, since there can be more possible compositions
  114. $outerIndex--;
  115. $outputLength--;
  116. $previousClass = 0;
  117. if ($outerIndex !== $previousStarter) {
  118. $this->getCombiningClass($codePoints[$outerIndex - 1]);
  119. }
  120. continue;
  121. }
  122. }
  123. if ($combiningClass === 0) {
  124. $previousStarter = $outerIndex;
  125. }
  126. $previousClass = $combiningClass;
  127. }
  128. return $codePoints;
  129. }
  130. /**
  131. * Decomposes a Hangul syllable
  132. * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
  133. * @param integer 32bit UCS4 code point
  134. * @return array Either Hangul Syllable decomposed or original 32bit value as one value array
  135. */
  136. private function hangulDecompose(int $codePoint): array
  137. {
  138. $sIndex = (int) $codePoint - self::sBase;
  139. if ($sIndex < 0 || $sIndex >= self::sCount) {
  140. return [$codePoint];
  141. }
  142. $result = [
  143. (int) self::lBase + $sIndex / self::nCount,
  144. (int) self::vBase + ($sIndex % self::nCount) / self::tCount,
  145. ];
  146. $T = intval(self::tBase + $sIndex % self::tCount);
  147. if ($T != self::tBase) {
  148. $result[] = $T;
  149. }
  150. return $result;
  151. }
  152. /**
  153. * Compose a Hangul syllable
  154. * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
  155. *
  156. * @param array $input Decomposed UCS4 sequence
  157. * @return array UCS4 sequence with syllables composed
  158. */
  159. private function hangulCompose(array $input): array
  160. {
  161. $inputLength = count($input);
  162. if ($inputLength === 0) {
  163. return [];
  164. }
  165. $previousCharCode = (int) $input[0];
  166. // copy first codepoint from input to output
  167. $result = [
  168. $previousCharCode,
  169. ];
  170. for ($i = 1; $i < $inputLength; ++$i) {
  171. $charCode = (int) $input[$i];
  172. $sIndex = $previousCharCode - self::sBase;
  173. $lIndex = $previousCharCode - self::lBase;
  174. $vIndex = $charCode - self::vBase;
  175. $tIndex = $charCode - self::tBase;
  176. // Find out, whether two current characters are LV and T
  177. if (0 <= $sIndex
  178. && $sIndex < self::sCount
  179. && ($sIndex % self::tCount == 0)
  180. && 0 <= $tIndex
  181. && $tIndex <= self::tCount
  182. ) {
  183. // create syllable of form LVT
  184. $previousCharCode += $tIndex;
  185. $result[(count($result) - 1)] = $previousCharCode; // reset last
  186. continue; // discard char
  187. }
  188. // Find out, whether two current characters form L and V
  189. if (0 <= $lIndex
  190. && $lIndex < self::lCount
  191. && 0 <= $vIndex
  192. && $vIndex < self::vCount
  193. ) {
  194. // create syllable of form LV
  195. $previousCharCode = (int) self::sBase + ($lIndex * self::vCount + $vIndex) * self::tCount;
  196. $result[(count($result) - 1)] = $previousCharCode; // reset last
  197. continue; // discard char
  198. }
  199. // if neither case was true, just add the character
  200. $previousCharCode = $charCode;
  201. $result[] = $charCode;
  202. }
  203. return $result;
  204. }
  205. /**
  206. * Returns the combining class of a certain wide char
  207. * @param integer $char Wide char to check (32bit integer)
  208. * @return integer Combining class if found, else 0
  209. */
  210. private function getCombiningClass(int $char): int
  211. {
  212. return isset($this->namePrepData->normalizeCombiningClasses[$char])
  213. ? $this->namePrepData->normalizeCombiningClasses[$char]
  214. : 0;
  215. }
  216. /**
  217. * Applies the canonical ordering of a decomposed UCS4 sequence
  218. * @param array $input Decomposed UCS4 sequence
  219. * @return array Ordered USC4 sequence
  220. */
  221. private function applyCanonicalOrdering(array $input): array
  222. {
  223. $needsSwapping = true;
  224. $inputLength = count($input);
  225. while ($needsSwapping) {
  226. $needsSwapping = false;
  227. $previousClass = $this->getCombiningClass(intval($input[0]));
  228. for ($outerIndex = 0; $outerIndex < $inputLength - 1; ++$outerIndex) {
  229. $nextClass = $this->getCombiningClass(intval($input[$outerIndex + 1]));
  230. if ($nextClass !== 0 && $previousClass > $nextClass) {
  231. // Move item leftward until it fits
  232. for ($innerIndex = $outerIndex + 1; $innerIndex > 0; --$innerIndex) {
  233. if ($this->getCombiningClass(intval($input[$innerIndex - 1])) <= $nextClass) {
  234. break;
  235. }
  236. $charToMove = intval($input[$innerIndex]);
  237. $input[$innerIndex] = intval($input[$innerIndex - 1]);
  238. $input[$innerIndex - 1] = $charToMove;
  239. $needsSwapping = true;
  240. }
  241. // Reentering the loop looking at the old character again
  242. $nextClass = $previousClass;
  243. }
  244. $previousClass = $nextClass;
  245. }
  246. }
  247. return $input;
  248. }
  249. /**
  250. * Do composition of a sequence of starter and non-starter
  251. * @param array $input UCS4 Decomposed sequence
  252. * @return array|false Ordered USC4 sequence
  253. */
  254. private function combine(array $input)
  255. {
  256. $inputLength = count($input);
  257. if (0 === $inputLength) {
  258. return false;
  259. }
  260. foreach ($this->namePrepData->replaceMaps as $namePrepSource => $namePrepTarget) {
  261. if ($namePrepTarget[0] !== $input[0]) {
  262. continue;
  263. }
  264. if (count($namePrepTarget) !== $inputLength) {
  265. continue;
  266. }
  267. $hit = false;
  268. foreach ($input as $k2 => $v2) {
  269. if ($v2 === $namePrepTarget[$k2]) {
  270. $hit = true;
  271. } else {
  272. $hit = false;
  273. break;
  274. }
  275. }
  276. if ($hit) {
  277. return $namePrepSource;
  278. }
  279. }
  280. return false;
  281. }
  282. }