/extensions/spellcheck/src/mozEnglishWordUtils.cpp

http://github.com/zpao/v8monkey · C++ · 319 lines · 225 code · 39 blank · 55 comment · 47 complexity · e83e55ac79ae7ec3f63c38f893fdd0d2 MD5 · raw file

  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Mozilla Spellchecker Component.
  16. *
  17. * The Initial Developer of the Original Code is
  18. * David Einstein.
  19. * Portions created by the Initial Developer are Copyright (C) 2001
  20. * the Initial Developer. All Rights Reserved.
  21. *
  22. * Contributor(s): David Einstein Deinst@world.std.com
  23. *
  24. * Alternatively, the contents of this file may be used under the terms of
  25. * either the GNU General Public License Version 2 or later (the "GPL"), or
  26. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27. * in which case the provisions of the GPL or the LGPL are applicable instead
  28. * of those above. If you wish to allow use of your version of this file only
  29. * under the terms of either the GPL or the LGPL, and not to allow others to
  30. * use your version of this file under the terms of the MPL, indicate your
  31. * decision by deleting the provisions above and replace them with the notice
  32. * and other provisions required by the GPL or the LGPL. If you do not delete
  33. * the provisions above, a recipient may use your version of this file under
  34. * the terms of any one of the MPL, the GPL or the LGPL.
  35. *
  36. * ***** END LICENSE BLOCK ***** */
  37. #include "mozEnglishWordUtils.h"
  38. #include "nsICharsetAlias.h"
  39. #include "nsReadableUtils.h"
  40. #include "nsIServiceManager.h"
  41. #include "nsUnicharUtils.h"
  42. #include "nsUnicharUtilCIID.h"
  43. #include "nsCRT.h"
  44. NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils)
  45. NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils)
  46. NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils)
  47. NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil)
  48. NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil)
  49. NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils)
  50. NS_INTERFACE_MAP_END
  51. NS_IMPL_CYCLE_COLLECTION_2(mozEnglishWordUtils,
  52. mCategories,
  53. mURLDetector)
  54. mozEnglishWordUtils::mozEnglishWordUtils()
  55. {
  56. mLanguage.AssignLiteral("en");
  57. nsresult rv;
  58. mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv);
  59. mCategories = do_GetService(NS_UNICHARCATEGORY_CONTRACTID);
  60. }
  61. mozEnglishWordUtils::~mozEnglishWordUtils()
  62. {
  63. }
  64. /* attribute wstring language; */
  65. NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(PRUnichar * *aLanguage)
  66. {
  67. nsresult rv = NS_OK;
  68. NS_ENSURE_ARG_POINTER(aLanguage);
  69. *aLanguage = ToNewUnicode(mLanguage);
  70. if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY;
  71. return rv;
  72. }
  73. /* void GetRootForm (in wstring aWord, in PRUint32 type, [array, size_is (count)] out wstring words, out PRUint32 count); */
  74. // return the possible root forms of aWord.
  75. NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const PRUnichar *aWord, PRUint32 type, PRUnichar ***words, PRUint32 *count)
  76. {
  77. nsAutoString word(aWord);
  78. PRUnichar **tmpPtr;
  79. PRInt32 length = word.Length();
  80. *count = 0;
  81. mozEnglishWordUtils::myspCapitalization ct = captype(word);
  82. switch (ct)
  83. {
  84. case HuhCap:
  85. case NoCap:
  86. tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *));
  87. if (!tmpPtr)
  88. return NS_ERROR_OUT_OF_MEMORY;
  89. tmpPtr[0] = ToNewUnicode(word);
  90. if (!tmpPtr[0]) {
  91. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
  92. return NS_ERROR_OUT_OF_MEMORY;
  93. }
  94. *words = tmpPtr;
  95. *count = 1;
  96. break;
  97. case AllCap:
  98. tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *) * 3);
  99. if (!tmpPtr)
  100. return NS_ERROR_OUT_OF_MEMORY;
  101. tmpPtr[0] = ToNewUnicode(word);
  102. if (!tmpPtr[0]) {
  103. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
  104. return NS_ERROR_OUT_OF_MEMORY;
  105. }
  106. ToLowerCase(tmpPtr[0], tmpPtr[0], length);
  107. tmpPtr[1] = ToNewUnicode(word);
  108. if (!tmpPtr[1]) {
  109. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
  110. return NS_ERROR_OUT_OF_MEMORY;
  111. }
  112. ToLowerCase(tmpPtr[1], tmpPtr[1], length);
  113. ToUpperCase(tmpPtr[1], tmpPtr[1], 1);
  114. tmpPtr[2] = ToNewUnicode(word);
  115. if (!tmpPtr[2]) {
  116. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr);
  117. return NS_ERROR_OUT_OF_MEMORY;
  118. }
  119. *words = tmpPtr;
  120. *count = 3;
  121. break;
  122. case InitCap:
  123. tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *) * 2);
  124. if (!tmpPtr)
  125. return NS_ERROR_OUT_OF_MEMORY;
  126. tmpPtr[0] = ToNewUnicode(word);
  127. if (!tmpPtr[0]) {
  128. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
  129. return NS_ERROR_OUT_OF_MEMORY;
  130. }
  131. ToLowerCase(tmpPtr[0], tmpPtr[0], length);
  132. tmpPtr[1] = ToNewUnicode(word);
  133. if (!tmpPtr[1]) {
  134. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
  135. return NS_ERROR_OUT_OF_MEMORY;
  136. }
  137. *words = tmpPtr;
  138. *count = 2;
  139. break;
  140. default:
  141. return NS_ERROR_FAILURE; // should never get here;
  142. }
  143. return NS_OK;
  144. }
  145. // This needs vast improvement
  146. bool mozEnglishWordUtils::ucIsAlpha(PRUnichar aChar)
  147. {
  148. // XXX we have to fix callers to handle the full Unicode range
  149. return nsIUGenCategory::kLetter == mCategories->Get(PRUint32(aChar));
  150. }
  151. /* void FindNextWord (in wstring word, in PRUint32 length, in PRUint32 offset, out PRUint32 begin, out PRUint32 end); */
  152. NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const PRUnichar *word, PRUint32 length, PRUint32 offset, PRInt32 *begin, PRInt32 *end)
  153. {
  154. const PRUnichar *p = word + offset;
  155. const PRUnichar *endbuf = word + length;
  156. const PRUnichar *startWord=p;
  157. if(p<endbuf){
  158. // XXX These loops should be modified to handle non-BMP characters.
  159. // if previous character is a word character, need to advance out of the word
  160. if (offset > 0 && ucIsAlpha(*(p-1))) {
  161. while (p < endbuf && ucIsAlpha(*p))
  162. p++;
  163. }
  164. while((p < endbuf) && (!ucIsAlpha(*p)))
  165. {
  166. p++;
  167. }
  168. startWord=p;
  169. while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\'')))
  170. {
  171. p++;
  172. }
  173. // we could be trying to break down a url, we don't want to break a url into parts,
  174. // instead we want to find out if it really is a url and if so, skip it, advancing startWord
  175. // to a point after the url.
  176. // before we spend more time looking to see if the word is a url, look for a url identifer
  177. // and make sure that identifer isn't the last character in the word fragment.
  178. if ( (*p == ':' || *p == '@' || *p == '.') && p < endbuf - 1) {
  179. // ok, we have a possible url...do more research to find out if we really have one
  180. // and determine the length of the url so we can skip over it.
  181. if (mURLDetector)
  182. {
  183. PRInt32 startPos = -1;
  184. PRInt32 endPos = -1;
  185. mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos);
  186. // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again
  187. if (startPos != -1 && endPos != -1) {
  188. startWord = p + endPos + 1; // skip over the url
  189. p = startWord; // reset p
  190. // now recursively call FindNextWord to search for the next word now that we have skipped the url
  191. return FindNextWord(word, length, startWord - word, begin, end);
  192. }
  193. }
  194. }
  195. while((p > startWord)&&(*(p-1) == '\'')){ // trim trailing apostrophes
  196. p--;
  197. }
  198. }
  199. else{
  200. startWord = endbuf;
  201. }
  202. if(startWord == endbuf){
  203. *begin = -1;
  204. *end = -1;
  205. }
  206. else{
  207. *begin = startWord-word;
  208. *end = p-word;
  209. }
  210. return NS_OK;
  211. }
  212. mozEnglishWordUtils::myspCapitalization
  213. mozEnglishWordUtils::captype(const nsString &word)
  214. {
  215. PRUnichar* lword=ToNewUnicode(word);
  216. ToUpperCase(lword,lword,word.Length());
  217. if(word.Equals(lword)){
  218. nsMemory::Free(lword);
  219. return AllCap;
  220. }
  221. ToLowerCase(lword,lword,word.Length());
  222. if(word.Equals(lword)){
  223. nsMemory::Free(lword);
  224. return NoCap;
  225. }
  226. PRInt32 length=word.Length();
  227. if(Substring(word,1,length-1).Equals(lword+1)){
  228. nsMemory::Free(lword);
  229. return InitCap;
  230. }
  231. nsMemory::Free(lword);
  232. return HuhCap;
  233. }
  234. // Convert the list of words in iwords to the same capitalization aWord and
  235. // return them in owords.
  236. NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const PRUnichar *aWord, const PRUnichar **iwords, PRUint32 icount, PRUnichar ***owords, PRUint32 *ocount)
  237. {
  238. nsAutoString word(aWord);
  239. nsresult rv = NS_OK;
  240. PRInt32 length;
  241. PRUnichar **tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *)*icount);
  242. if (!tmpPtr)
  243. return NS_ERROR_OUT_OF_MEMORY;
  244. mozEnglishWordUtils::myspCapitalization ct = captype(word);
  245. for(PRUint32 i = 0; i < icount; ++i) {
  246. length = nsCRT::strlen(iwords[i]);
  247. tmpPtr[i] = (PRUnichar *) nsMemory::Alloc(sizeof(PRUnichar) * (length + 1));
  248. if (NS_UNLIKELY(!tmpPtr[i])) {
  249. NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr);
  250. return NS_ERROR_OUT_OF_MEMORY;
  251. }
  252. memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(PRUnichar));
  253. nsAutoString capTest(tmpPtr[i]);
  254. mozEnglishWordUtils::myspCapitalization newCt=captype(capTest);
  255. if(newCt == NoCap){
  256. switch(ct)
  257. {
  258. case HuhCap:
  259. case NoCap:
  260. break;
  261. case AllCap:
  262. ToUpperCase(tmpPtr[i],tmpPtr[i],length);
  263. rv = NS_OK;
  264. break;
  265. case InitCap:
  266. ToUpperCase(tmpPtr[i],tmpPtr[i],1);
  267. rv = NS_OK;
  268. break;
  269. default:
  270. rv = NS_ERROR_FAILURE; // should never get here;
  271. break;
  272. }
  273. }
  274. }
  275. if (NS_SUCCEEDED(rv)){
  276. *owords = tmpPtr;
  277. *ocount = icount;
  278. }
  279. return rv;
  280. }