PageRenderTime 44ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/third_party/sqlite/ext/fts3/fts3_icu.c

https://github.com/akesling/chromium
C | 258 lines | 182 code | 37 blank | 39 comment | 14 complexity | 0a7d15577ce8c0a9f26910e06d38bcb8 MD5 | raw file
  1. /*
  2. ** 2007 June 22
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. ** This file implements a tokenizer for fts3 based on the ICU library.
  13. **
  14. ** $Id: fts3_icu.c,v 1.2 2007/10/24 21:52:37 shess Exp $
  15. */
  16. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
  17. #ifdef SQLITE_ENABLE_ICU
  18. #include <assert.h>
  19. #include <string.h>
  20. #include "fts3_tokenizer.h"
  21. #include <unicode/ubrk.h>
  22. #include <unicode/ucol.h>
  23. #include <unicode/ustring.h>
  24. #include <unicode/utf16.h>
  25. typedef struct IcuTokenizer IcuTokenizer;
  26. typedef struct IcuCursor IcuCursor;
  27. struct IcuTokenizer {
  28. sqlite3_tokenizer base;
  29. char *zLocale;
  30. };
  31. struct IcuCursor {
  32. sqlite3_tokenizer_cursor base;
  33. UBreakIterator *pIter; /* ICU break-iterator object */
  34. int nChar; /* Number of UChar elements in pInput */
  35. UChar *aChar; /* Copy of input using utf-16 encoding */
  36. int *aOffset; /* Offsets of each character in utf-8 input */
  37. int nBuffer;
  38. char *zBuffer;
  39. int iToken;
  40. };
  41. /*
  42. ** Create a new tokenizer instance.
  43. */
  44. static int icuCreate(
  45. int argc, /* Number of entries in argv[] */
  46. const char * const *argv, /* Tokenizer creation arguments */
  47. sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
  48. ){
  49. IcuTokenizer *p;
  50. int n = 0;
  51. if( argc>0 ){
  52. n = strlen(argv[0])+1;
  53. }
  54. p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
  55. if( !p ){
  56. return SQLITE_NOMEM;
  57. }
  58. memset(p, 0, sizeof(IcuTokenizer));
  59. if( n ){
  60. p->zLocale = (char *)&p[1];
  61. memcpy(p->zLocale, argv[0], n);
  62. }
  63. *ppTokenizer = (sqlite3_tokenizer *)p;
  64. return SQLITE_OK;
  65. }
  66. /*
  67. ** Destroy a tokenizer
  68. */
  69. static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  70. IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  71. sqlite3_free(p);
  72. return SQLITE_OK;
  73. }
  74. /*
  75. ** Prepare to begin tokenizing a particular string. The input
  76. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  77. ** used to incrementally tokenize this string is returned in
  78. ** *ppCursor.
  79. */
  80. static int icuOpen(
  81. sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  82. const char *zInput, /* Input string */
  83. int nInput, /* Length of zInput in bytes */
  84. sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
  85. ){
  86. IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  87. IcuCursor *pCsr;
  88. const int32_t opt = U_FOLD_CASE_DEFAULT;
  89. UErrorCode status = U_ZERO_ERROR;
  90. int nChar;
  91. UChar32 c;
  92. int iInput = 0;
  93. int iOut = 0;
  94. *ppCursor = 0;
  95. if( -1 == nInput ) nInput = strlen(nInput);
  96. nChar = nInput+1;
  97. pCsr = (IcuCursor *)sqlite3_malloc(
  98. sizeof(IcuCursor) + /* IcuCursor */
  99. nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
  100. (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
  101. );
  102. if( !pCsr ){
  103. return SQLITE_NOMEM;
  104. }
  105. memset(pCsr, 0, sizeof(IcuCursor));
  106. pCsr->aChar = (UChar *)&pCsr[1];
  107. pCsr->aOffset = (int *)&pCsr->aChar[nChar];
  108. pCsr->aOffset[iOut] = iInput;
  109. U8_NEXT(zInput, iInput, nInput, c);
  110. while( c>0 ){
  111. int isError = 0;
  112. c = u_foldCase(c, opt);
  113. U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
  114. if( isError ){
  115. sqlite3_free(pCsr);
  116. return SQLITE_ERROR;
  117. }
  118. pCsr->aOffset[iOut] = iInput;
  119. if( iInput<nInput ){
  120. U8_NEXT(zInput, iInput, nInput, c);
  121. }else{
  122. c = 0;
  123. }
  124. }
  125. pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  126. if( !U_SUCCESS(status) ){
  127. sqlite3_free(pCsr);
  128. return SQLITE_ERROR;
  129. }
  130. pCsr->nChar = iOut;
  131. ubrk_first(pCsr->pIter);
  132. *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  133. return SQLITE_OK;
  134. }
  135. /*
  136. ** Close a tokenization cursor previously opened by a call to icuOpen().
  137. */
  138. static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  139. IcuCursor *pCsr = (IcuCursor *)pCursor;
  140. ubrk_close(pCsr->pIter);
  141. sqlite3_free(pCsr->zBuffer);
  142. sqlite3_free(pCsr);
  143. return SQLITE_OK;
  144. }
  145. /*
  146. ** Extract the next token from a tokenization cursor.
  147. */
  148. static int icuNext(
  149. sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
  150. const char **ppToken, /* OUT: *ppToken is the token text */
  151. int *pnBytes, /* OUT: Number of bytes in token */
  152. int *piStartOffset, /* OUT: Starting offset of token */
  153. int *piEndOffset, /* OUT: Ending offset of token */
  154. int *piPosition /* OUT: Position integer of token */
  155. ){
  156. IcuCursor *pCsr = (IcuCursor *)pCursor;
  157. int iStart = 0;
  158. int iEnd = 0;
  159. int nByte = 0;
  160. while( iStart==iEnd ){
  161. UChar32 c;
  162. iStart = ubrk_current(pCsr->pIter);
  163. iEnd = ubrk_next(pCsr->pIter);
  164. if( iEnd==UBRK_DONE ){
  165. return SQLITE_DONE;
  166. }
  167. while( iStart<iEnd ){
  168. int iWhite = iStart;
  169. U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
  170. if( u_isspace(c) ){
  171. iStart = iWhite;
  172. }else{
  173. break;
  174. }
  175. }
  176. assert(iStart<=iEnd);
  177. }
  178. do {
  179. UErrorCode status = U_ZERO_ERROR;
  180. if( nByte ){
  181. char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
  182. if( !zNew ){
  183. return SQLITE_NOMEM;
  184. }
  185. pCsr->zBuffer = zNew;
  186. pCsr->nBuffer = nByte;
  187. }
  188. u_strToUTF8(
  189. pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
  190. &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
  191. &status /* Output success/failure */
  192. );
  193. } while( nByte>pCsr->nBuffer );
  194. *ppToken = pCsr->zBuffer;
  195. *pnBytes = nByte;
  196. *piStartOffset = pCsr->aOffset[iStart];
  197. *piEndOffset = pCsr->aOffset[iEnd];
  198. *piPosition = pCsr->iToken++;
  199. return SQLITE_OK;
  200. }
  201. /*
  202. ** The set of routines that implement the simple tokenizer
  203. */
  204. static const sqlite3_tokenizer_module icuTokenizerModule = {
  205. 0, /* iVersion */
  206. icuCreate, /* xCreate */
  207. icuDestroy, /* xCreate */
  208. icuOpen, /* xOpen */
  209. icuClose, /* xClose */
  210. icuNext, /* xNext */
  211. };
  212. /*
  213. ** Set *ppModule to point at the implementation of the ICU tokenizer.
  214. */
  215. void sqlite3Fts3IcuTokenizerModule(
  216. sqlite3_tokenizer_module const**ppModule
  217. ){
  218. *ppModule = &icuTokenizerModule;
  219. }
  220. #endif /* defined(SQLITE_ENABLE_ICU) */
  221. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */