PageRenderTime 46ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/application/third_party/ar-php/Arabic/CharsetD.php

https://bitbucket.org/machaven/limesurvey
PHP | 165 lines | 37 code | 10 blank | 118 comment | 2 complexity | 1dc9d26c562aa092abb4f8a3ab5aaaf0 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, BSD-3-Clause, GPL-3.0, LGPL-3.0
  1. <?php
  2. /**
  3. * ----------------------------------------------------------------------
  4. *
  5. * Copyright (c) 2006-2012 Khaled Al-Sham'aa.
  6. *
  7. * http://www.ar-php.org
  8. *
  9. * PHP Version 5
  10. *
  11. * ----------------------------------------------------------------------
  12. *
  13. * LICENSE
  14. *
  15. * This program is open source product; you can redistribute it and/or
  16. * modify it under the terms of the GNU Lesser General Public License (LGPL)
  17. * as published by the Free Software Foundation; either version 3
  18. * of the License, or (at your option) any later version.
  19. *
  20. * This program is distributed in the hope that it will be useful,
  21. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  22. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  23. * GNU Lesser General Public License for more details.
  24. *
  25. * You should have received a copy of the GNU Lesser General Public License
  26. * along with this program. If not, see <http://www.gnu.org/licenses/lgpl.txt>.
  27. *
  28. * ----------------------------------------------------------------------
  29. *
  30. * Class Name: Detect Arabic String Character Set
  31. *
  32. * Filename: CharsetD.php
  33. *
  34. * Original Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
  35. *
  36. * Purpose: This class returns the Arabic character set used by a
  37. * given Arabic string passed to it. The character sets it can
  38. * detect are the three most popular ones: Windows-1256,
  39. * ISO 8859-6, and UTF-8.
  40. *
  41. * ----------------------------------------------------------------------
  42. *
  43. * Detect Arabic String Character Set
  44. *
  45. * The last step of the Information Retrieval process is to display the found
  46. * documents to the user. However, some difficulties might occur at that point.
  47. * English texts are usually written in the ASCII standard. Unlike the English
  48. * language, many languages have different character sets, and do not have one
  49. * standard. This plurality of standards causes problems, especially in a web
  50. * environment.
  51. *
  52. * This PHP class returns the Arabic character set used by a given
  53. * Arabic string passed to it. The character sets it can detect are
  54. * the three most popular ones: Windows-1256,
  55. * ISO 8859-6, and UTF-8.
  56. *
  57. * Example:
  58. * <code>
  59. * include('./I18N/Arabic.php');
  60. * $obj = new I18N_Arabic('CharsetD');
  61. *
  62. * $charset = $obj->getCharset($text);
  63. * </code>
  64. *
  65. * @category I18N
  66. * @package I18N_Arabic
  67. * @author Khaled Al-Sham'aa <khaled@ar-php.org>
  68. * @copyright 2006-2012 Khaled Al-Sham'aa
  69. *
  70. * @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
  71. * @link http://www.ar-php.org
  72. */
  73. // New in PHP V5.3: Namespaces
  74. // namespace I18N\Arabic;
  75. //
  76. // $obj = new I18N\Arabic\CharsetD();
  77. //
  78. // use I18N\Arabic;
  79. // $obj = new Arabic\CharsetD();
  80. //
  81. // use I18N\Arabic\CharsetD as CharsetD;
  82. // $obj = new CharsetD();
  83. /**
  84. * This PHP class detects the character set of an Arabic string
  85. *
  86. * @category I18N
  87. * @package I18N_Arabic
  88. * @author Khaled Al-Sham'aa <khaled@ar-php.org>
  89. * @copyright 2006-2012 Khaled Al-Sham'aa
  90. *
  91. * @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
  92. * @link http://www.ar-php.org
  93. */
  class I18N_Arabic_CharsetD
  {
      /**
       * No state to initialise; the explicit constructor is kept so existing
       * callers that instantiate the class keep working unchanged.
       *
       * @ignore
       */
      public function __construct()
      {
      }

      /**
       * Count the hits for the most frequent Arabic letters (Alef, Lam and
       * Yeh) as they would be encoded in each candidate character set, then
       * express each count as a rounded percentage of all hits.
       *
       * When the string contains none of the probed byte sequences (for
       * example an empty or pure-ASCII string) every ratio is 0 instead of
       * triggering a division by zero.
       *
       * @param String $string Arabic string in unknown format
       *
       * @return Array Character set as key and string association ratio as value
       *               (keys, in order: windows-1256, iso-8859-6, utf-8)
       * @author Khaled Al-Sham'aa <khaled@ar-php.org>
       */
      public function guess($string)
      {
          // Byte sequences of Alef, Lam and Yeh in each candidate encoding.
          // Windows-1256 and ISO-8859-6 share the same byte for Alef (199),
          // so a single-byte Alef counts towards both of them.
          $signatures = array(
              'windows-1256' => array(chr(199), chr(225), chr(237)),
              'iso-8859-6'   => array(chr(199), chr(228), chr(234)),
              'utf-8'        => array(
                  chr(216) . chr(167),
                  chr(217) . chr(132),
                  chr(217) . chr(138),
              ),
          );

          $hits  = array();
          $total = 0;

          foreach ($signatures as $name => $needles) {
              $hits[$name] = 0;

              foreach ($needles as $needle) {
                  $hits[$name] += substr_count($string, $needle);
              }

              $total += $hits[$name];
          }

          $charset = array();

          foreach ($hits as $name => $count) {
              // Guard against $total === 0: no probed letter was found at all.
              $charset[$name] = ($total > 0) ? round($count * 100 / $total) : 0.0;
          }

          return $charset;
      }

      /**
       * Find the most likely character set for the given Arabic string in
       * unknown format.
       *
       * If the string contains an HTML meta tag declaring a charset, the
       * declared value is returned exactly as written. The pattern requires a
       * space directly before "charset=" and a double quote closing the value,
       * so it matches the legacy form
       * <meta ... content="text/html; charset=X"> but not <meta charset="X">.
       * Otherwise the charset with the highest ratio from guess() is returned;
       * the winner among equal ratios is whichever arsort() places first.
       *
       * @param String $string Arabic string in unknown format
       *
       * @return String The most likely character set for the given Arabic string
       *                [utf-8|windows-1256|iso-8859-6], or the value declared
       *                in a meta tag when one is present
       * @author Khaled Al-Sham'aa <khaled@ar-php.org>
       */
      public function getCharset($string)
      {
          // An explicit declaration inside the markup wins over the heuristic.
          if (preg_match('/<meta .* charset=([^\"]+)".*>/sim', $string, $matches)) {
              return $matches[1];
          }

          $ratios = $this->guess($string);

          // Highest ratio first, keys preserved; the first key is the answer.
          arsort($ratios);
          reset($ratios);

          return key($ratios);
      }
  }