PageRenderTime 33ms CodeModel.GetById 10ms app.highlight 18ms RepoModel.GetById 1ms app.codeStats 0ms

/extensions/spellcheck/src/mozEnglishWordUtils.cpp

http://github.com/zpao/v8monkey
C++ | 319 lines | 225 code | 39 blank | 55 comment | 43 complexity | e83e55ac79ae7ec3f63c38f893fdd0d2 MD5 | raw file
  1/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2/* ***** BEGIN LICENSE BLOCK *****
  3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  4 *
  5 * The contents of this file are subject to the Mozilla Public License Version
  6 * 1.1 (the "License"); you may not use this file except in compliance with
  7 * the License. You may obtain a copy of the License at
  8 * http://www.mozilla.org/MPL/
  9 *
 10 * Software distributed under the License is distributed on an "AS IS" basis,
 11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 12 * for the specific language governing rights and limitations under the
 13 * License.
 14 *
 15 * The Original Code is Mozilla Spellchecker Component.
 16 *
 17 * The Initial Developer of the Original Code is
 18 * David Einstein.
 19 * Portions created by the Initial Developer are Copyright (C) 2001
 20 * the Initial Developer. All Rights Reserved.
 21 *
 22 * Contributor(s): David Einstein Deinst@world.std.com
 23 *
 24 * Alternatively, the contents of this file may be used under the terms of
 25 * either the GNU General Public License Version 2 or later (the "GPL"), or
 26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 27 * in which case the provisions of the GPL or the LGPL are applicable instead
 28 * of those above. If you wish to allow use of your version of this file only
 29 * under the terms of either the GPL or the LGPL, and not to allow others to
 30 * use your version of this file under the terms of the MPL, indicate your
 31 * decision by deleting the provisions above and replace them with the notice
 32 * and other provisions required by the GPL or the LGPL. If you do not delete
 33 * the provisions above, a recipient may use your version of this file under
 34 * the terms of any one of the MPL, the GPL or the LGPL.
 35 *
 36 * ***** END LICENSE BLOCK ***** */
 37
 38#include "mozEnglishWordUtils.h"
 39#include "nsICharsetAlias.h"
 40#include "nsReadableUtils.h"
 41#include "nsIServiceManager.h"
 42#include "nsUnicharUtils.h"
 43#include "nsUnicharUtilCIID.h"
 44#include "nsCRT.h"
 45
  // XPCOM boilerplate for mozEnglishWordUtils: AddRef/Release are generated by
  // the cycle-collecting variants of the reference-counting macros.
  46NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils)
  47NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils)
  48
  // Interface map: the class answers for mozISpellI18NUtil, for nsISupports
  // (resolved through mozISpellI18NUtil to avoid ambiguity), and for the
  // cycle-collection entries.
  49NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils)
  50  NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil)
  51  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil)
  52  NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils)
  53NS_INTERFACE_MAP_END
  54
  // Registers the two members handed to the cycle-collection macro:
  // mCategories and mURLDetector.
  55NS_IMPL_CYCLE_COLLECTION_2(mozEnglishWordUtils,
  56                           mCategories,
  57                           mURLDetector)
 58
 59mozEnglishWordUtils::mozEnglishWordUtils()
 60{
 61  mLanguage.AssignLiteral("en");
 62
 63  nsresult rv;
 64  mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv);
 65  mCategories = do_GetService(NS_UNICHARCATEGORY_CONTRACTID);
 66}
 67
 68mozEnglishWordUtils::~mozEnglishWordUtils()
 69{
 70}
 71
 72/* attribute wstring language; */
 73NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(PRUnichar * *aLanguage)
 74{
 75  nsresult rv = NS_OK;
 76  NS_ENSURE_ARG_POINTER(aLanguage);
 77
 78  *aLanguage = ToNewUnicode(mLanguage);
 79  if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY;
 80  return rv;
 81 }
 82
 83/* void GetRootForm (in wstring aWord, in PRUint32 type, [array, size_is (count)] out wstring words, out PRUint32 count); */
 84// return the possible root forms of aWord.
 85NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const PRUnichar *aWord, PRUint32 type, PRUnichar ***words, PRUint32 *count)
 86{
 87  nsAutoString word(aWord);
 88  PRUnichar **tmpPtr;
 89  PRInt32 length = word.Length();
 90
 91  *count = 0;
 92
 93  mozEnglishWordUtils::myspCapitalization ct = captype(word);
 94  switch (ct)
 95    {
 96    case HuhCap:
 97    case NoCap: 
 98      tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *));
 99      if (!tmpPtr)
100        return NS_ERROR_OUT_OF_MEMORY;
101      tmpPtr[0] = ToNewUnicode(word);
102      if (!tmpPtr[0]) {
103        NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
104        return NS_ERROR_OUT_OF_MEMORY;
105      }
106      *words = tmpPtr;
107      *count = 1;
108      break;
109    
110
111    case AllCap:
112      tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *) * 3);
113      if (!tmpPtr)
114        return NS_ERROR_OUT_OF_MEMORY;
115      tmpPtr[0] = ToNewUnicode(word);
116      if (!tmpPtr[0]) {
117        NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
118        return NS_ERROR_OUT_OF_MEMORY;
119      }
120      ToLowerCase(tmpPtr[0], tmpPtr[0], length);
121
122      tmpPtr[1] = ToNewUnicode(word);
123      if (!tmpPtr[1]) {
124        NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
125        return NS_ERROR_OUT_OF_MEMORY;
126      }
127      ToLowerCase(tmpPtr[1], tmpPtr[1], length);
128      ToUpperCase(tmpPtr[1], tmpPtr[1], 1);
129
130      tmpPtr[2] = ToNewUnicode(word);
131      if (!tmpPtr[2]) {
132        NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr);
133        return NS_ERROR_OUT_OF_MEMORY;
134      }
135
136      *words = tmpPtr;
137      *count = 3;
138      break;
139 
140    case InitCap:  
141      tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *) * 2);
142      if (!tmpPtr)
143        return NS_ERROR_OUT_OF_MEMORY;
144
145      tmpPtr[0] = ToNewUnicode(word);
146      if (!tmpPtr[0]) {
147        NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
148        return NS_ERROR_OUT_OF_MEMORY;
149      }
150      ToLowerCase(tmpPtr[0], tmpPtr[0], length);
151
152      tmpPtr[1] = ToNewUnicode(word);
153      if (!tmpPtr[1]) {
154        NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
155        return NS_ERROR_OUT_OF_MEMORY;
156      }
157
158      *words = tmpPtr;
159      *count = 2;
160      break;
161    default:
162      return NS_ERROR_FAILURE; // should never get here;
163    }
164  return NS_OK;
165}
166
167// This needs vast improvement
168bool mozEnglishWordUtils::ucIsAlpha(PRUnichar aChar)
169{
170  // XXX we have to fix callers to handle the full Unicode range
171  return nsIUGenCategory::kLetter == mCategories->Get(PRUint32(aChar));
172}
173
174/* void FindNextWord (in wstring word, in PRUint32 length, in PRUint32 offset, out PRUint32 begin, out PRUint32 end); */
175NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const PRUnichar *word, PRUint32 length, PRUint32 offset, PRInt32 *begin, PRInt32 *end)
176{
177  const PRUnichar *p = word + offset;
178  const PRUnichar *endbuf = word + length;
179  const PRUnichar *startWord=p;
180  if(p<endbuf){
181    // XXX These loops should be modified to handle non-BMP characters.
182    // if previous character is a word character, need to advance out of the word
183    if (offset > 0 && ucIsAlpha(*(p-1))) {
184      while (p < endbuf && ucIsAlpha(*p))
185        p++;
186    }
187    while((p < endbuf) && (!ucIsAlpha(*p)))
188      {
189        p++;
190      }
191    startWord=p;
192    while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\'')))
193      { 
194        p++;
195      }
196    
197    // we could be trying to break down a url, we don't want to break a url into parts,
198    // instead we want to find out if it really is a url and if so, skip it, advancing startWord 
199    // to a point after the url.
200
201    // before we spend more time looking to see if the word is a url, look for a url identifer
202    // and make sure that identifer isn't the last character in the word fragment.
203    if ( (*p == ':' || *p == '@' || *p == '.') &&  p < endbuf - 1) {
204
205        // ok, we have a possible url...do more research to find out if we really have one
206        // and determine the length of the url so we can skip over it.
207       
208        if (mURLDetector)
209        {
210          PRInt32 startPos = -1;
211          PRInt32 endPos = -1;        
212
213          mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos);
214
215          // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again
216          if (startPos != -1 && endPos != -1) { 
217            startWord = p + endPos + 1; // skip over the url
218            p = startWord; // reset p
219
220            // now recursively call FindNextWord to search for the next word now that we have skipped the url
221            return FindNextWord(word, length, startWord - word, begin, end);
222          }
223        }
224    }
225
226    while((p > startWord)&&(*(p-1) == '\'')){  // trim trailing apostrophes
227      p--;
228    }
229  }
230  else{
231    startWord = endbuf;
232  }
233  if(startWord == endbuf){
234    *begin = -1;
235    *end = -1;
236  }
237  else{
238    *begin = startWord-word;
239    *end = p-word;
240  }
241  return NS_OK;
242}
243
244mozEnglishWordUtils::myspCapitalization 
245mozEnglishWordUtils::captype(const nsString &word)
246{
247  PRUnichar* lword=ToNewUnicode(word);  
248  ToUpperCase(lword,lword,word.Length());
249  if(word.Equals(lword)){
250    nsMemory::Free(lword);
251    return AllCap;
252  }
253
254  ToLowerCase(lword,lword,word.Length());
255  if(word.Equals(lword)){
256    nsMemory::Free(lword);
257    return NoCap;
258  }
259  PRInt32 length=word.Length();
260  if(Substring(word,1,length-1).Equals(lword+1)){
261    nsMemory::Free(lword);
262    return InitCap;
263  }
264  nsMemory::Free(lword);
265  return HuhCap;
266}
267
268// Convert the list of words in iwords to the same capitalization aWord and 
269// return them in owords.
270NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const PRUnichar *aWord, const PRUnichar **iwords, PRUint32 icount, PRUnichar ***owords, PRUint32 *ocount)
271{
272  nsAutoString word(aWord);
273  nsresult rv = NS_OK;
274
275  PRInt32 length;
276  PRUnichar **tmpPtr  = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *)*icount);
277  if (!tmpPtr)
278    return NS_ERROR_OUT_OF_MEMORY;
279
280  mozEnglishWordUtils::myspCapitalization ct = captype(word);
281  for(PRUint32 i = 0; i < icount; ++i) {
282    length = nsCRT::strlen(iwords[i]);
283    tmpPtr[i] = (PRUnichar *) nsMemory::Alloc(sizeof(PRUnichar) * (length + 1));
284    if (NS_UNLIKELY(!tmpPtr[i])) {
285      NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr);
286      return NS_ERROR_OUT_OF_MEMORY;
287    }
288    memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(PRUnichar));
289
290    nsAutoString capTest(tmpPtr[i]);
291    mozEnglishWordUtils::myspCapitalization newCt=captype(capTest);
292    if(newCt == NoCap){
293      switch(ct) 
294        {
295        case HuhCap:
296        case NoCap:
297          break;
298        case AllCap:
299          ToUpperCase(tmpPtr[i],tmpPtr[i],length);
300          rv = NS_OK;
301          break;
302        case InitCap:  
303          ToUpperCase(tmpPtr[i],tmpPtr[i],1);
304          rv = NS_OK;
305          break;
306        default:
307          rv = NS_ERROR_FAILURE; // should never get here;
308          break;
309
310        }
311    }
312  }
313  if (NS_SUCCEEDED(rv)){
314    *owords = tmpPtr;
315    *ocount = icount;
316  }
317  return rv;
318}
319