PageRenderTime 28ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/third_party/WebKit/Source/platform/text/Character.cpp

https://gitlab.com/0072016/Facebook-SDK-
C++ | 369 lines | 255 code | 45 blank | 69 comment | 80 complexity | 1600391d1c3d8b828f319923533bd0b0 MD5 | raw file
  1. /*
  2. * Copyright (C) 2014 Google Inc. All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. */
  30. #include "platform/text/Character.h"
  31. #include "wtf/StdLibExtras.h"
  32. #include "wtf/text/StringBuilder.h"
  33. #include <algorithm>
  34. #include <unicode/uobject.h>
  35. #include <unicode/uscript.h>
  36. #if defined(USING_SYSTEM_ICU)
  37. #include "platform/fonts/CharacterPropertyDataGenerator.h"
  38. #include <unicode/uniset.h>
  39. #else
  40. #define MUTEX_H // Prevent compile failure of utrie2.h on Windows
  41. #include <utrie2.h>
  42. #endif
  43. using namespace WTF;
  44. using namespace Unicode;
  45. namespace blink {
  46. #if defined(USING_SYSTEM_ICU)
  47. static icu::UnicodeSet* createUnicodeSet(
  48. const UChar32* characters, size_t charactersCount,
  49. const UChar32* ranges, size_t rangesCount)
  50. {
  51. icu::UnicodeSet* unicodeSet = new icu::UnicodeSet();
  52. for (size_t i = 0; i < charactersCount; i++)
  53. unicodeSet->add(characters[i]);
  54. for (size_t i = 0; i < rangesCount; i += 2)
  55. unicodeSet->add(ranges[i], ranges[i + 1]);
  56. unicodeSet->freeze();
  57. return unicodeSet;
  58. }
  // Expands to a createUnicodeSet() call over the tables named
  // <name>Array and <name>Ranges.
  #define CREATE_UNICODE_SET(name) \
      createUnicodeSet( \
          name##Array, WTF_ARRAY_LENGTH(name##Array), \
          name##Ranges, WTF_ARRAY_LENGTH(name##Ranges))

  // Function-body helper: builds the set for |name| the first time the
  // enclosing function runs, keeps it in a function-local static, and returns
  // whether that set contains |c|.
  #define RETURN_HAS_PROPERTY(c, name) \
      static icu::UnicodeSet* cachedSet = nullptr; \
      if (cachedSet == nullptr) \
          cachedSet = CREATE_UNICODE_SET(name); \
      return cachedSet->contains(c);
  68. #else
  69. // Freezed trie tree, see CharacterDataGenerator.cpp.
  70. extern int32_t serializedCharacterDataSize;
  71. extern uint8_t serializedCharacterData[];
  72. static UTrie2* createTrie()
  73. {
  74. // Create a Trie from the value array.
  75. UErrorCode error = U_ZERO_ERROR;
  76. UTrie2* trie = utrie2_openFromSerialized(
  77. UTrie2ValueBits::UTRIE2_16_VALUE_BITS,
  78. serializedCharacterData, serializedCharacterDataSize,
  79. nullptr, &error);
  80. ASSERT(error == U_ZERO_ERROR);
  81. return trie;
  82. }
  83. static bool hasProperty(UChar32 c, CharacterProperty property)
  84. {
  85. static UTrie2* trie = nullptr;
  86. if (!trie)
  87. trie = createTrie();
  88. return UTRIE2_GET16(trie, c)
  89. & static_cast<CharacterPropertyType>(property);
  90. }
  91. #define RETURN_HAS_PROPERTY(c, name) \
  92. return hasProperty(c, CharacterProperty::name);
  93. #endif
  // Reports whether |value| lies inside one of the intervals of |intervalList|.
  //
  // |intervalList| is a flattened list of closed intervals:
  // { first0, last0, first1, last1, ... }. The lookup is a binary search, so
  // the list has to be sorted in ascending order.
  template <class T, size_t size>
  bool valueInIntervalList(const T (&intervalList)[size], const T& value)
  {
      const T* const listBegin = intervalList;
      const T* const firstGreater = std::upper_bound(listBegin, listBegin + size, value);
      const size_t notGreaterCount = firstGreater - listBegin;

      // An odd number of entries <= value means the last such entry opens an
      // interval whose closing bound is still greater than value.
      if (notGreaterCount % 2 == 1)
          return true;

      // Otherwise value can only match by being exactly a closing bound.
      return notGreaterCount > 0 && *(firstGreater - 1) == value;
  }
  103. CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len)
  104. {
  105. static const UChar complexCodePathRanges[] = {
  106. // U+02E5 through U+02E9 (Modifier Letters : Tone letters)
  107. 0x2E5, 0x2E9,
  108. // U+0300 through U+036F Combining diacritical marks
  109. 0x300, 0x36F,
  110. // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ...
  111. 0x0591, 0x05BD,
  112. // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha
  113. 0x05BF, 0x05CF,
  114. // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic,
  115. // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
  116. // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar
  117. 0x0600, 0x109F,
  118. // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left
  119. // here if you precompose; Modern Korean will be precomposed as a result of step A)
  120. 0x1100, 0x11FF,
  121. // U+135D through U+135F Ethiopic combining marks
  122. 0x135D, 0x135F,
  123. // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa,Khmer, Mongolian
  124. 0x1700, 0x18AF,
  125. // U+1900 through U+194F Limbu (Unicode 4.0)
  126. 0x1900, 0x194F,
  127. // U+1980 through U+19DF New Tai Lue
  128. 0x1980, 0x19DF,
  129. // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic
  130. 0x1A00, 0x1CFF,
  131. // U+1DC0 through U+1DFF Comining diacritical mark supplement
  132. 0x1DC0, 0x1DFF,
  133. // U+20D0 through U+20FF Combining marks for symbols
  134. 0x20D0, 0x20FF,
  135. // U+2CEF through U+2CF1 Combining marks for Coptic
  136. 0x2CEF, 0x2CF1,
  137. // U+302A through U+302F Ideographic and Hangul Tone marks
  138. 0x302A, 0x302F,
  139. // Combining Katakana-Hiragana Voiced/Semi-voiced Sound Mark
  140. 0x3099, 0x309A,
  141. // U+A67C through U+A67D Combining marks for old Cyrillic
  142. 0xA67C, 0xA67D,
  143. // U+A6F0 through U+A6F1 Combining mark for Bamum
  144. 0xA6F0, 0xA6F1,
  145. // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended,
  146. // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek
  147. 0xA800, 0xABFF,
  148. // U+D7B0 through U+D7FF Hangul Jamo Ext. B
  149. 0xD7B0, 0xD7FF,
  150. // U+FE00 through U+FE0F Unicode variation selectors
  151. 0xFE00, 0xFE0F,
  152. // U+FE20 through U+FE2F Combining half marks
  153. 0xFE20, 0xFE2F
  154. };
  155. CodePath result = SimplePath;
  156. for (unsigned i = 0; i < len; i++) {
  157. const UChar c = characters[i];
  158. // Shortcut for common case
  159. if (c < 0x2E5)
  160. continue;
  161. // Surrogate pairs
  162. if (c > 0xD7FF && c <= 0xDBFF) {
  163. if (i == len - 1)
  164. continue;
  165. UChar next = characters[++i];
  166. if (!U16_IS_TRAIL(next))
  167. continue;
  168. UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next);
  169. if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols
  170. continue;
  171. if (supplementaryCharacter <= 0x1F1FF)
  172. return ComplexPath;
  173. // Emoji Fitzpatrick modifiers trigger upgrade to complex path for shaping them.
  174. if (supplementaryCharacter < 0x1F3FB)
  175. continue;
  176. if (supplementaryCharacter <= 0x1F3FF)
  177. return ComplexPath;
  178. if (supplementaryCharacter == eyeCharacter)
  179. return ComplexPath;
  180. // Man and Woman Emojies,
  181. // in order to support emoji joiner combinations for family and couple pictographs.
  182. // Compare http://unicode.org/reports/tr51/#Emoji_ZWJ_Sequences
  183. if (supplementaryCharacter < 0x1F468)
  184. continue;
  185. if (supplementaryCharacter <= 0x1F469)
  186. return ComplexPath;
  187. if (supplementaryCharacter == leftSpeechBubbleCharacter)
  188. return ComplexPath;
  189. if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors.
  190. continue;
  191. if (supplementaryCharacter <= 0xE01EF)
  192. return ComplexPath;
  193. // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts
  194. // in plane 1 or higher.
  195. continue;
  196. }
  197. // Search for other Complex cases
  198. if (valueInIntervalList(complexCodePathRanges, c))
  199. return ComplexPath;
  200. }
  201. return result;
  202. }
  203. bool Character::isUprightInMixedVertical(UChar32 character)
  204. {
  205. RETURN_HAS_PROPERTY(character, isUprightInMixedVertical)
  206. }
  207. bool Character::isCJKIdeographOrSymbol(UChar32 c)
  208. {
  209. // Likely common case
  210. if (c < 0x2C7)
  211. return false;
  212. RETURN_HAS_PROPERTY(c, isCJKIdeographOrSymbol)
  213. }
  214. unsigned Character::expansionOpportunityCount(const LChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion, const TextJustify textJustify)
  215. {
  216. unsigned count = 0;
  217. if (textJustify == TextJustifyDistribute) {
  218. isAfterExpansion = true;
  219. return length;
  220. }
  221. if (direction == LTR) {
  222. for (size_t i = 0; i < length; ++i) {
  223. if (treatAsSpace(characters[i])) {
  224. count++;
  225. isAfterExpansion = true;
  226. } else {
  227. isAfterExpansion = false;
  228. }
  229. }
  230. } else {
  231. for (size_t i = length; i > 0; --i) {
  232. if (treatAsSpace(characters[i - 1])) {
  233. count++;
  234. isAfterExpansion = true;
  235. } else {
  236. isAfterExpansion = false;
  237. }
  238. }
  239. }
  240. return count;
  241. }
  242. unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion, const TextJustify textJustify)
  243. {
  244. unsigned count = 0;
  245. if (direction == LTR) {
  246. for (size_t i = 0; i < length; ++i) {
  247. UChar32 character = characters[i];
  248. if (treatAsSpace(character)) {
  249. count++;
  250. isAfterExpansion = true;
  251. continue;
  252. }
  253. if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) {
  254. character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]);
  255. i++;
  256. }
  257. if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) {
  258. if (!isAfterExpansion)
  259. count++;
  260. count++;
  261. isAfterExpansion = true;
  262. continue;
  263. }
  264. isAfterExpansion = false;
  265. }
  266. } else {
  267. for (size_t i = length; i > 0; --i) {
  268. UChar32 character = characters[i - 1];
  269. if (treatAsSpace(character)) {
  270. count++;
  271. isAfterExpansion = true;
  272. continue;
  273. }
  274. if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) {
  275. character = U16_GET_SUPPLEMENTARY(characters[i - 2], character);
  276. i--;
  277. }
  278. if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) {
  279. if (!isAfterExpansion)
  280. count++;
  281. count++;
  282. isAfterExpansion = true;
  283. continue;
  284. }
  285. isAfterExpansion = false;
  286. }
  287. }
  288. return count;
  289. }
  290. bool Character::canReceiveTextEmphasis(UChar32 c)
  291. {
  292. CharCategory category = Unicode::category(c);
  293. if (category & (Separator_Space | Separator_Line | Separator_Paragraph | Other_NotAssigned | Other_Control | Other_Format))
  294. return false;
  295. // Additional word-separator characters listed in CSS Text Level 3 Editor's Draft 3 November 2010.
  296. if (c == ethiopicWordspaceCharacter || c == aegeanWordSeparatorLineCharacter || c == aegeanWordSeparatorDotCharacter
  297. || c == ugariticWordDividerCharacter || c == tibetanMarkIntersyllabicTshegCharacter || c == tibetanMarkDelimiterTshegBstarCharacter)
  298. return false;
  299. return true;
  300. }
  301. template <typename CharacterType>
  302. static inline String normalizeSpacesInternal(const CharacterType* characters, unsigned length)
  303. {
  304. StringBuilder normalized;
  305. normalized.reserveCapacity(length);
  306. for (unsigned i = 0; i < length; ++i)
  307. normalized.append(Character::normalizeSpaces(characters[i]));
  308. return normalized.toString();
  309. }
  310. String Character::normalizeSpaces(const LChar* characters, unsigned length)
  311. {
  312. return normalizeSpacesInternal(characters, length);
  313. }
  314. String Character::normalizeSpaces(const UChar* characters, unsigned length)
  315. {
  316. return normalizeSpacesInternal(characters, length);
  317. }
  318. bool Character::isCommonOrInheritedScript(UChar32 character)
  319. {
  320. UErrorCode status = U_ZERO_ERROR;
  321. UScriptCode script = uscript_getScript(character, &status);
  322. return U_SUCCESS(status) && (script == USCRIPT_COMMON || script == USCRIPT_INHERITED);
  323. }
  324. } // namespace blink