PageRenderTime 72ms CodeModel.GetById 36ms RepoModel.GetById 0ms app.codeStats 0ms

/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp

https://bitbucket.org/zenoalbisser/webkit
C++ | 388 lines | 294 code | 57 blank | 37 comment | 81 complexity | 2bb13b800dc65fafa444d62f6fcc0c23 MD5 | raw file
  1. /*
  2. * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
  3. * Copyright (C) 2007 Apple Inc. All rights reserved.
  4. * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
  5. * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
  6. * Copyright (C) 2010 Igalia S.L.
  7. *
  8. * This library is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Library General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2 of the License, or (at your option) any later version.
  12. *
  13. * This library is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Library General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Library General Public License
  19. * along with this library; see the file COPYING.LIB. If not, write to
  20. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  21. * Boston, MA 02110-1301, USA.
  22. *
  23. */
  24. #include "config.h"
  25. #include "TextBreakIterator.h"
  26. #include "GOwnPtr.h"
  27. #include <pango/pango.h>
  28. using namespace std;
  29. #define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF)
  30. namespace WebCore {
  31. class CharacterIterator {
  32. public:
  33. bool setText(const UChar* string, int length);
  34. const gchar* getText() { return m_utf8.get(); }
  35. int getLength() { return m_length; }
  36. glong getSize() { return m_size; }
  37. void setIndex(int index);
  38. int getIndex() { return m_index; }
  39. void setUTF16Index(int index);
  40. int getUTF16Index() { return m_utf16Index; }
  41. int getUTF16Length() { return m_utf16Length; }
  42. int first();
  43. int last();
  44. int next();
  45. int previous();
  46. private:
  47. int characterSize(int index);
  48. GOwnPtr<char> m_utf8;
  49. int m_length;
  50. long m_size;
  51. int m_index;
  52. int m_utf16Index;
  53. int m_utf16Length;
  54. };
  55. int CharacterIterator::characterSize(int index)
  56. {
  57. if (index == m_length || index < 0)
  58. return 0;
  59. if (m_length == m_utf16Length)
  60. return 1;
  61. gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
  62. gunichar character = g_utf8_get_char(indexPtr);
  63. return UTF8_IS_SURROGATE(character) ? 2 : 1;
  64. }
  65. bool CharacterIterator::setText(const UChar* string, int length)
  66. {
  67. long utf8Size = 0;
  68. m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));
  69. if (!utf8Size)
  70. return false;
  71. m_utf16Length = length;
  72. m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
  73. m_size = utf8Size;
  74. m_index = 0;
  75. m_utf16Index = 0;
  76. return true;
  77. }
  78. void CharacterIterator::setIndex(int index)
  79. {
  80. if (index == m_index)
  81. return;
  82. if (index <= 0)
  83. m_index = m_utf16Index = 0;
  84. else if (index >= m_length) {
  85. m_index = m_length;
  86. m_utf16Index = m_utf16Length;
  87. } else if (m_length == m_utf16Length)
  88. m_index = m_utf16Index = index;
  89. else {
  90. m_index = index;
  91. int utf16Index = 0;
  92. int utf8Index = 0;
  93. while (utf8Index < index) {
  94. utf16Index += characterSize(utf8Index);
  95. utf8Index++;
  96. }
  97. m_utf16Index = utf16Index;
  98. }
  99. }
  100. void CharacterIterator::setUTF16Index(int index)
  101. {
  102. if (index == m_utf16Index)
  103. return;
  104. if (index <= 0)
  105. m_utf16Index = m_index = 0;
  106. else if (index >= m_utf16Length) {
  107. m_utf16Index = m_utf16Length;
  108. m_index = m_length;
  109. } else if (m_length == m_utf16Length)
  110. m_utf16Index = m_index = index;
  111. else {
  112. m_utf16Index = index;
  113. int utf16Index = 0;
  114. int utf8Index = 0;
  115. while (utf16Index < index) {
  116. utf16Index += characterSize(utf8Index);
  117. utf8Index++;
  118. }
  119. m_index = utf8Index;
  120. }
  121. }
  122. int CharacterIterator::first()
  123. {
  124. m_index = m_utf16Index = 0;
  125. return m_index;
  126. }
  127. int CharacterIterator::last()
  128. {
  129. m_index = m_length;
  130. m_utf16Index = m_utf16Length;
  131. return m_index;
  132. }
  133. int CharacterIterator::next()
  134. {
  135. int next = m_index + 1;
  136. if (next <= m_length) {
  137. m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length);
  138. m_index = next;
  139. } else {
  140. m_index = TextBreakDone;
  141. m_utf16Index = TextBreakDone;
  142. }
  143. return m_index;
  144. }
  145. int CharacterIterator::previous()
  146. {
  147. int previous = m_index - 1;
  148. if (previous >= 0) {
  149. m_utf16Index = max(m_utf16Index - characterSize(previous), 0);
  150. m_index = previous;
  151. } else {
  152. m_index = TextBreakDone;
  153. m_utf16Index = TextBreakDone;
  154. }
  155. return m_index;
  156. }
  157. enum UBreakIteratorType {
  158. UBRK_CHARACTER,
  159. UBRK_WORD,
  160. UBRK_LINE,
  161. UBRK_SENTENCE
  162. };
  163. class TextBreakIterator {
  164. public:
  165. UBreakIteratorType m_type;
  166. PangoLogAttr* m_logAttrs;
  167. CharacterIterator m_charIterator;
  168. };
  169. static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
  170. UBreakIteratorType type, const UChar* string, int length)
  171. {
  172. if (!string)
  173. return 0;
  174. if (!createdIterator) {
  175. iterator = new TextBreakIterator();
  176. createdIterator = true;
  177. }
  178. if (!iterator)
  179. return 0;
  180. if (!iterator->m_charIterator.setText(string, length))
  181. return 0;
  182. int charLength = iterator->m_charIterator.getLength();
  183. iterator->m_type = type;
  184. if (createdIterator)
  185. g_free(iterator->m_logAttrs);
  186. iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1);
  187. pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(),
  188. -1, 0, iterator->m_logAttrs, charLength + 1);
  189. return iterator;
  190. }
  191. TextBreakIterator* characterBreakIterator(const UChar* string, int length)
  192. {
  193. static bool createdCharacterBreakIterator = false;
  194. static TextBreakIterator* staticCharacterBreakIterator;
  195. return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
  196. }
  197. TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
  198. {
  199. // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
  200. return characterBreakIterator(string, length);
  201. }
  202. TextBreakIterator* wordBreakIterator(const UChar* string, int length)
  203. {
  204. static bool createdWordBreakIterator = false;
  205. static TextBreakIterator* staticWordBreakIterator;
  206. return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length);
  207. }
  208. static bool createdLineBreakIterator = false;
  209. static TextBreakIterator* staticLineBreakIterator;
  210. TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString&)
  211. {
  212. TextBreakIterator* lineBreakIterator = 0;
  213. if (!createdLineBreakIterator || staticLineBreakIterator) {
  214. setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
  215. swap(staticLineBreakIterator, lineBreakIterator);
  216. }
  217. if (!lineBreakIterator) {
  218. bool createdNewLineBreakIterator = false;
  219. setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
  220. }
  221. return lineBreakIterator;
  222. }
  223. void releaseLineBreakIterator(TextBreakIterator* iterator)
  224. {
  225. ASSERT(createdLineBreakIterator);
  226. ASSERT(iterator);
  227. if (!staticLineBreakIterator)
  228. staticLineBreakIterator = iterator;
  229. else
  230. delete iterator;
  231. }
  232. TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
  233. {
  234. static bool createdSentenceBreakIterator = false;
  235. static TextBreakIterator* staticSentenceBreakIterator;
  236. return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
  237. }
  238. int textBreakFirst(TextBreakIterator* iterator)
  239. {
  240. iterator->m_charIterator.first();
  241. return iterator->m_charIterator.getUTF16Index();
  242. }
  243. int textBreakLast(TextBreakIterator* iterator)
  244. {
  245. // TextBreakLast is not meant to find just any break according to bi->m_type
  246. // but really the one near the last character.
  247. // (cmp ICU documentation for ubrk_first and ubrk_last)
  248. // From ICU docs for ubrk_last:
  249. // "Determine the index immediately beyond the last character in the text being scanned."
  250. // So we should advance or traverse back based on bi->m_logAttrs cursor positions.
  251. // If last character position in the original string is a whitespace,
  252. // traverse to the left until the first non-white character position is found
  253. // and return the position of the first white-space char after this one.
  254. // Otherwise return m_length, as "the first character beyond the last" is outside our string.
  255. bool whiteSpaceAtTheEnd = true;
  256. int nextWhiteSpacePos = iterator->m_charIterator.getLength();
  257. int pos = iterator->m_charIterator.last();
  258. while (pos >= 0 && whiteSpaceAtTheEnd) {
  259. if (iterator->m_logAttrs[pos].is_cursor_position) {
  260. if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white)
  261. nextWhiteSpacePos = pos;
  262. }
  263. pos = iterator->m_charIterator.previous();
  264. }
  265. iterator->m_charIterator.setIndex(nextWhiteSpacePos);
  266. return iterator->m_charIterator.getUTF16Index();
  267. }
  268. int textBreakNext(TextBreakIterator* iterator)
  269. {
  270. while (iterator->m_charIterator.next() != TextBreakDone) {
  271. int index = iterator->m_charIterator.getIndex();
  272. // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €,
  273. // are not marked as word_start & word_end as opposed to the way ICU does it.
  274. // This leads to - for example - different word selection behaviour when right clicking.
  275. if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
  276. || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
  277. || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
  278. || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
  279. break;
  280. }
  281. }
  282. return iterator->m_charIterator.getUTF16Index();
  283. }
  284. int textBreakPrevious(TextBreakIterator* iterator)
  285. {
  286. while (iterator->m_charIterator.previous() != TextBreakDone) {
  287. int index = iterator->m_charIterator.getIndex();
  288. if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
  289. || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
  290. || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
  291. || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
  292. break;
  293. }
  294. }
  295. return iterator->m_charIterator.getUTF16Index();
  296. }
  297. int textBreakPreceding(TextBreakIterator* iterator, int offset)
  298. {
  299. if (offset > iterator->m_charIterator.getUTF16Length())
  300. return TextBreakDone;
  301. if (offset < 0)
  302. return 0;
  303. iterator->m_charIterator.setUTF16Index(offset);
  304. return textBreakPrevious(iterator);
  305. }
  306. int textBreakFollowing(TextBreakIterator* iterator, int offset)
  307. {
  308. if (offset > iterator->m_charIterator.getUTF16Length())
  309. return TextBreakDone;
  310. if (offset < 0)
  311. return 0;
  312. iterator->m_charIterator.setUTF16Index(offset);
  313. return textBreakNext(iterator);
  314. }
  315. int textBreakCurrent(TextBreakIterator* iterator)
  316. {
  317. return iterator->m_charIterator.getUTF16Index();
  318. }
  319. bool isTextBreak(TextBreakIterator* iterator, int offset)
  320. {
  321. if (!offset)
  322. return true;
  323. if (offset > iterator->m_charIterator.getUTF16Length())
  324. return false;
  325. iterator->m_charIterator.setUTF16Index(offset);
  326. int index = iterator->m_charIterator.getIndex();
  327. iterator->m_charIterator.previous();
  328. textBreakNext(iterator);
  329. return iterator->m_charIterator.getIndex() == index;
  330. }
  331. }