PageRenderTime 24ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/sites/all/modules/service_container/lib/Drupal/Component/Transliteration/PhpTransliteration.php

https://gitlab.com/leoplanxxi/dr7-web-buap-2016
PHP | 288 lines | 106 code | 29 blank | 153 comment | 30 complexity | 327ca97624a0b95b9818cf4c8d0e38df MD5 | raw file
  1. <?php
  2. /**
  3. * @file
  4. * Definition of \Drupal\Component\Transliteration\PhpTransliteration.
  5. *
  6. * Some parts of this code were derived from the MediaWiki project's UtfNormal
  7. * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
  8. * http://www.mediawiki.org/
  9. */
  10. namespace Drupal\Component\Transliteration;
  11. /**
  12. * Implements transliteration without using the PECL extensions.
  13. *
  14. * Transliterations are done character-by-character, by looking up non-US-ASCII
  15. * characters in a transliteration database.
  16. *
  17. * The database comes from two types of files, both of which are searched for in
  18. * the PhpTransliteration::$dataDirectory directory. First, language-specific
  19. * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
  20. * there is no language-specific override for a character, the generic
  21. * transliteration character tables are searched (see
  22. * PhpTransliteration::readGenericData()). If looking up the character in the
  23. * generic table results in a NULL value, or an illegal character is
  24. * encountered, then a substitute character is returned.
  25. */
  26. class PhpTransliteration implements TransliterationInterface {
  27. /**
  28. * Directory where data for transliteration resides.
  29. *
  30. * The constructor sets this (by default) to subdirectory 'data' underneath
  31. * the directory where the class's PHP file resides.
  32. *
  33. * @var string
  34. */
  35. protected $dataDirectory;
  36. /**
  37. * Associative array of language-specific character transliteration tables.
  38. *
  39. * The outermost array keys are language codes. For each language code key,
  40. * the value is an array whose keys are Unicode character codes, and whose
  41. * values are the transliterations of those characters to US-ASCII. This is
  42. * set up as needed in PhpTransliteration::replace() by calling
  43. * PhpTransliteration::readLanguageOverrides().
  44. *
  45. * @var array
  46. */
  47. protected $languageOverrides = array();
  48. /**
  49. * Non-language-specific transliteration tables.
  50. *
  51. * Array whose keys are the upper two bytes of the Unicode character, and
  52. * whose values are an array of transliterations for each lower-two bytes
  53. * character code. This is set up as needed in PhpTransliteration::replace()
  54. * by calling PhpTransliteration::readGenericData().
  55. *
  56. * @var array
  57. */
  58. protected $genericMap = array();
  59. /**
  60. * Constructs a transliteration object.
  61. *
  62. * @param string $data_directory
  63. * (optional) The directory where data files reside. If omitted, defaults
  64. * to subdirectory 'data' underneath the directory where the class's PHP
  65. * file resides.
  66. */
  67. public function __construct($data_directory = NULL) {
  68. $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  69. }
  70. /**
  71. * {@inheritdoc}
  72. */
  73. public function removeDiacritics($string) {
  74. $result = '';
  75. foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
  76. $code = self::ordUTF8($character);
  77. // These two Unicode ranges include the accented US-ASCII letters, with a
  78. // few characters that aren't accented letters mixed in. So define the
  79. // ranges and the excluded characters.
  80. $range1 = $code > 0x00bf && $code < 0x017f;
  81. $exclusions_range1 = array(0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b);
  82. $range2 = $code > 0x01cc && $code < 0x0250;
  83. $exclusions_range2 = array(0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245);
  84. $replacement = $character;
  85. if (($range1 && !in_array($code, $exclusions_range1)) || ($range2 && !in_array($code, $exclusions_range2))) {
  86. $to_add = $this->lookupReplacement($code, 'xyz');
  87. if(strlen($to_add) === 1) {
  88. $replacement = $to_add;
  89. }
  90. }
  91. $result .= $replacement;
  92. }
  93. return $result;
  94. }
  95. /**
  96. * {@inheritdoc}
  97. */
  98. public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
  99. $result = '';
  100. $length = 0;
  101. // Split into Unicode characters and transliterate each one.
  102. foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
  103. $code = self::ordUTF8($character);
  104. if ($code == -1) {
  105. $to_add = $unknown_character;
  106. }
  107. else {
  108. $to_add = $this->replace($code, $langcode, $unknown_character);
  109. }
  110. // Check if this exceeds the maximum allowed length.
  111. if (isset($max_length)) {
  112. $length += strlen($to_add);
  113. if ($length > $max_length) {
  114. // There is no more space.
  115. return $result;
  116. }
  117. }
  118. $result .= $to_add;
  119. }
  120. return $result;
  121. }
  122. /**
  123. * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
  124. *
  125. * @param string $character
  126. * A single UTF-8 character.
  127. *
  128. * @return int
  129. * The character code, or -1 if an illegal character is found.
  130. */
  131. protected static function ordUTF8($character) {
  132. $first_byte = ord($character[0]);
  133. if (($first_byte & 0x80) == 0) {
  134. // Single-byte form: 0xxxxxxxx.
  135. return $first_byte;
  136. }
  137. if (($first_byte & 0xe0) == 0xc0) {
  138. // Two-byte form: 110xxxxx 10xxxxxx.
  139. return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
  140. }
  141. if (($first_byte & 0xf0) == 0xe0) {
  142. // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
  143. return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
  144. }
  145. if (($first_byte & 0xf8) == 0xf0) {
  146. // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  147. return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
  148. }
  149. // Other forms are not legal.
  150. return -1;
  151. }
  152. /**
  153. * Replaces a single Unicode character using the transliteration database.
  154. *
  155. * @param int $code
  156. * The character code of a Unicode character.
  157. * @param string $langcode
  158. * The language code of the language the character is in.
  159. * @param string $unknown_character
  160. * The character to substitute for characters without transliterated
  161. * equivalents.
  162. *
  163. * @return string
  164. * US-ASCII replacement character. If it has a mapping, it is returned;
  165. * otherwise, $unknown_character is returned. The replacement can contain
  166. * multiple characters.
  167. */
  168. protected function replace($code, $langcode, $unknown_character) {
  169. if ($code < 0x80) {
  170. // Already lower ASCII.
  171. return chr($code);
  172. }
  173. // See if there is a language-specific override for this character.
  174. if (!isset($this->languageOverrides[$langcode])) {
  175. $this->readLanguageOverrides($langcode);
  176. }
  177. if (isset($this->languageOverrides[$langcode][$code])) {
  178. return $this->languageOverrides[$langcode][$code];
  179. }
  180. return $this->lookupReplacement($code, $unknown_character);
  181. }
  182. /**
  183. * Look up the generic replacement for a UTF-8 character code.
  184. *
  185. * @param $code
  186. * The UTF-8 character code.
  187. * @param string $unknown_character
  188. * (optional) The character to substitute for characters without entries in
  189. * the replacement tables.
  190. *
  191. * @return string
  192. * US-ASCII replacement characters. If it has a mapping, it is returned;
  193. * otherwise, $unknown_character is returned. The replacement can contain
  194. * multiple characters.
  195. */
  196. protected function lookupReplacement($code, $unknown_character = '?') {
  197. // See if there is a generic mapping for this character.
  198. $bank = $code >> 8;
  199. if (!isset($this->genericMap[$bank])) {
  200. $this->readGenericData($bank);
  201. }
  202. $code = $code & 0xff;
  203. return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  204. }
  205. /**
  206. * Reads in language overrides for a language code.
  207. *
  208. * The data is read from files named "$langcode.php" in
  209. * PhpTransliteration::$dataDirectory. These files should set up an array
  210. * variable $overrides with an element whose key is $langcode and whose value
  211. * is an array whose keys are character codes, and whose values are their
  212. * transliterations in this language. The character codes can be for any valid
  213. * Unicode character, independent of the number of bytes.
  214. *
  215. * @param $langcode
  216. * Code for the language to read.
  217. */
  218. protected function readLanguageOverrides($langcode) {
  219. // Figure out the file name to use by sanitizing the language code,
  220. // just in case.
  221. $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';
  222. // Read in this file, which should set up a variable called $overrides,
  223. // which will be local to this function.
  224. if (is_file($file)) {
  225. include $file;
  226. }
  227. if (!isset($overrides) || !is_array($overrides)) {
  228. $overrides = array($langcode => array());
  229. }
  230. $this->languageOverrides[$langcode] = $overrides[$langcode];
  231. }
  232. /**
  233. * Reads in generic transliteration data for a bank of characters.
  234. *
  235. * The data is read in from a file named "x$bank.php" (with $bank in
  236. * hexadecimal notation) in PhpTransliteration::$dataDirectory. These files
  237. * should set up a variable $bank containing an array whose numerical indices
  238. * are the remaining two bytes of the character code, and whose values are the
  239. * transliterations of these characters into US-ASCII. Note that the maximum
  240. * Unicode character that can be encoded in this way is 4 bytes.
  241. *
  242. * @param $bank
  243. * First two bytes of the Unicode character, or 0 for the ASCII range.
  244. */
  245. protected function readGenericData($bank) {
  246. // Figure out the file name.
  247. $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';
  248. // Read in this file, which should set up a variable called $base, which
  249. // will be local to this function.
  250. if (is_file($file)) {
  251. include $file;
  252. }
  253. if (!isset($base) || !is_array($base)) {
  254. $base = array();
  255. }
  256. // Save this data.
  257. $this->genericMap[$bank] = $base;
  258. }
  259. }