PageRenderTime 50ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/core/src/friso_lexicon.c

https://bitbucket.org/linju_tony/test
C | 373 lines | 245 code | 72 blank | 56 comment | 125 complexity | c9db554a7c8763446c33a4ab95c256fa MD5 | raw file
  1. /*
  2. * friso lexicon implemented functions.
  3. * used to deal with the friso lexicon, like: load,remove,match...
  4. *
  5. * @author chenxin
  6. * @email chenxin619315@gmail.com
  7. */
  8. #include "friso_API.h"
  9. #include "friso.h"
  10. #include <stdlib.h>
  11. #include <string.h>
  12. #define __SPLIT_MAX_TOKENS__ 5
  13. #define __LEX_FILE_DELIME__ '#'
  14. #define __FRISO_LEX_IFILE__ "friso.lex.ini"
  15. //create a new lexicon
  16. __EXTERN_API__ friso_dic_t friso_dic_new() {
  17. register uint_t t;
  18. friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC( sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );
  19. if ( dic == NULL ) {
  20. ___ALLOCATION_ERROR___
  21. }
  22. for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
  23. dic[t] = new_hash_table();
  24. }
  25. return dic;
  26. }
  27. __EXTERN_API__ void friso_dic_free( friso_dic_t dic ) {
  28. register uint_t t;
  29. for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
  30. free_hash_table( dic[t], 1 );
  31. }
  32. FRISO_FREE( dic );
  33. }
  34. //create a new lexicon entry
  35. __EXTERN_API__ lex_entry_t new_lex_entry( string word, string syn, uint_t fre, char length, char type ) {
  36. lex_entry_t e = ( lex_entry_t ) FRISO_MALLOC( sizeof( lex_entry_cdt ) );
  37. if ( e == NULL ) {
  38. ___ALLOCATION_ERROR___
  39. }
  40. //initialize.
  41. e->word = word;
  42. e->syn = syn;
  43. e->fre = fre;
  44. e->length = length;
  45. e->type = type;
  46. return e;
  47. }
  48. //free the given lexicon entry.
  49. __EXTERN_API__ void free_lex_entry( lex_entry_t e ) {
  50. FRISO_FREE( e );
  51. }
  52. //add a new entry to the dictionary.
  53. __EXTERN_API__ void friso_dic_add( friso_dic_t dic, friso_lex_t lex,
  54. string word, string syn ) {
  55. if ( lex >= 0 || lex < __FRISO_LEXICON_LENGTH__ ) {
  56. //printf("lex=%d, word=%s, syn=%s\n", lex, word, syn);
  57. hash_put_mapping( dic[lex], word, new_lex_entry( word, syn, 0, ( char ) strlen(word), ( char ) lex ) );
  58. }
  59. }
  60. __EXTERN_API__ void friso_dic_add_with_fre( friso_dic_t dic, friso_lex_t lex,
  61. string word, string syn, uint_t frequency ) {
  62. if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
  63. hash_put_mapping( dic[lex], word, new_lex_entry( word, syn, frequency, ( char ) strlen(word), ( char ) lex ) );
  64. }
  65. }
  66. /*
  67. * read a line from a specified string.
  68. * the newline will be cleared.
  69. *
  70. * @date 2012-11-24
  71. */
  72. __EXTERN_API__ string file_get_line( string __dst, FILE * _stream ) {
  73. register int c;
  74. string cs;
  75. cs = __dst;
  76. while ( ( c = fgetc( _stream ) ) != EOF ) {
  77. if ( c == '\n' ) break;
  78. *cs++ = c;
  79. }
  80. *cs = '\0';
  81. return ( c == EOF && cs == __dst ) ? NULL : __dst;
  82. }
  83. /*
  84. * static function to copy a string.
  85. */
  86. ///instead of memcpy
  87. __STATIC_API__ string string_copy( string _src, string __dst, uint_t blocks ) {
  88. register string __src = _src;
  89. register uint_t t;
  90. for ( t = 0; t < blocks; t++ ) {
  91. if ( *__src == '\0' ) break;
  92. __dst[t] = *__src++;
  93. }
  94. __dst[t] = '\0';
  95. return __dst;
  96. }
  97. /*
  98. * find the postion of the first appear of the given char.
  99. * adress of the char in the string will be return
  100. * if found or return NULL
  101. */
  102. __STATIC_API__ string indexOf( string __str, char delimiter ) {
  103. uint_t i, __length__;
  104. __length__ = strlen( __str );
  105. for ( i = 0; i < __length__; i++ ) {
  106. if ( __str[i] == delimiter )
  107. return __str + i;
  108. }
  109. return NULL;
  110. }
  111. /*
  112. * split the given string with the given delimiter.
  113. *
  114. * @return the pointer of all the tokens.
  115. */
  116. __STATIC_API__ string * string_split( string *__tokens, string str, const char delim ) {
  117. register uint_t t, _toks = 0, \
  118. length = strlen( str );
  119. string ch, s = str;
  120. //clear the tokens
  121. for ( t = 0; t < __SPLIT_MAX_TOKENS__; t++ ) {
  122. __tokens[t] = NULL;
  123. }
  124. //get the number of the tokens
  125. for ( t = 0; t < length; t++ ) {
  126. if ( str[t] == delim )
  127. _toks++;
  128. }
  129. if ( _toks > 0 ) {
  130. _toks = 0;
  131. while ( ( ch = indexOf(s, delim) ) != NULL ) {
  132. t = ch - s + 1;
  133. __tokens[_toks] = string_copy( s, ( string ) FRISO_MALLOC( t ), t - 1);
  134. s = ch + 1;
  135. _toks++;
  136. if ( _toks >= __SPLIT_MAX_TOKENS__ ) {
  137. break;
  138. }
  139. }
  140. //have not reach the end, read the left chars.
  141. if ( _toks < __SPLIT_MAX_TOKENS__ && *s != '\0' ) {
  142. t = str + length - s + 1;
  143. __tokens[_toks] = string_copy( s, ( string ) FRISO_MALLOC( t ), t - 1);
  144. }
  145. } else {
  146. //a string without tokens.
  147. t = length + 1;
  148. __tokens[_toks] = string_copy( s, ( string ) FRISO_MALLOC( t ), t - 1);
  149. }
  150. return __tokens;
  151. }
  152. //load words from a lexicon file.
  153. __EXTERN_API__ void friso_dic_load( friso_dic_t dic, friso_lex_t lex, \
  154. string lex_file, uint_t length ) {
  155. FILE * _stream;
  156. register uint_t t;
  157. char __char[1024];
  158. string _line, __tokens[__SPLIT_MAX_TOKENS__];
  159. if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL ) {
  160. while ( ( _line = file_get_line( __char, _stream ) ) != NULL ) {
  161. //passing the lexicon notes.
  162. if ( indexOf( _line, __LEX_FILE_DELIME__ ) == _line ) continue;
  163. string_split( __tokens, _line, '/' );
  164. if ( __tokens[0] != NULL ) {
  165. if ( strlen( __tokens[0] ) <= length ) {
  166. if ( __tokens[2] != NULL ) { //word freedom frequency
  167. friso_dic_add_with_fre( dic, lex,
  168. __tokens[0],
  169. __tokens[1], atoi( __tokens[2] ) );
  170. } else {
  171. friso_dic_add( dic, lex,
  172. __tokens[0], __tokens[1] );
  173. }
  174. } else {
  175. FRISO_FREE( __tokens[0] );
  176. }
  177. }
  178. //free the useless allocations.
  179. for ( t = 2; t < __SPLIT_MAX_TOKENS__; t++ ) {
  180. if ( __tokens[t] != NULL )
  181. FRISO_FREE( __tokens[t] );
  182. }
  183. }
  184. fclose( _stream );
  185. }
  186. }
  187. __STATIC_API__ int get_lexicon_type_with_constant( string _key ) {
  188. if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) {
  189. return __LEX_CJK_WORDS__;
  190. }
  191. else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) {
  192. return __LEX_CJK_UNITS__;
  193. }
  194. else if ( strcmp( _key, "__LEX_MIX_WORDS__" ) == 0 ) {
  195. return __LEX_MIX_WORDS__;
  196. }
  197. else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) {
  198. return __LEX_CN_LNAME__;
  199. }
  200. else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) {
  201. return __LEX_CN_SNAME__;
  202. }
  203. else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) {
  204. return __LEX_CN_DNAME1__;
  205. }
  206. else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) {
  207. return __LEX_CN_DNAME2__;
  208. }
  209. else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) {
  210. return __LEX_CN_LNA__;
  211. }
  212. return -1;
  213. }
  214. /*
  215. * load the lexicon configuration file.
  216. * and load all the valid lexicon from the configuration file.
  217. */
  218. __EXTERN_API__ void friso_dic_load_from_ifile( friso_dic_t dic, string _path, uint_t _limits ) {
  219. //1.parse the configuration file.
  220. FILE * __stream;
  221. char __chars__[1024], __key__[30], *__line__;
  222. uint_t __length__, i, t;
  223. friso_lex_t lex_t;
  224. string_buffer_t sb;
  225. sb = new_string_buffer();
  226. string_buffer_append( sb, _path );
  227. string_buffer_append( sb, __FRISO_LEX_IFILE__ );
  228. if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL ) {
  229. while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) {
  230. //comment filter.
  231. if ( __line__[0] == '#' ) continue;
  232. if ( __line__[0] == '\0' ) continue;
  233. __length__ = strlen( __line__ );
  234. //item start
  235. if ( __line__[ __length__ - 1 ] == '[' ) {
  236. //get the type key
  237. for ( i = 0; i < __length__
  238. && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
  239. for ( t = 0; i < __length__; i++,t++ ) {
  240. if ( __line__[i] == ' ' || __line__[i] == '\t' || __line__[i] == ':' ) break;
  241. __key__[t] = __line__[i];
  242. }
  243. __key__[t] = '\0';
  244. //get the lexicon type
  245. lex_t = get_lexicon_type_with_constant(__key__);
  246. if ( lex_t == -1 ) continue;
  247. //printf("key=%s, type=%d\n", __key__, lex_t );
  248. while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) {
  249. //comments filter.
  250. if ( __line__[0] == '#' ) continue;
  251. if ( __line__[0] == '\0' ) continue;
  252. __length__ = strlen( __line__ );
  253. if ( __line__[ __length__ - 1 ] == ']' ) break;
  254. for ( i = 0; i < __length__
  255. && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
  256. for ( t = 0; i < __length__; i++,t++ ) {
  257. if ( __line__[i] == ' ' || __line__[i] == '\t' || __line__[i] == ';' ) break;
  258. __key__[t] = __line__[i];
  259. }
  260. __key__[t] = '\0';
  261. //load the lexicon item from the lexicon file.
  262. string_buffer_clear( sb );
  263. string_buffer_append( sb, _path );
  264. string_buffer_append( sb, __key__ );
  265. friso_dic_load( dic, lex_t, sb->buffer, _limits );
  266. }
  267. }
  268. } //end while
  269. fclose( __stream );
  270. }
  271. free_string_buffer(sb);
  272. }
  273. //match the item.
  274. __EXTERN_API__ int friso_dic_match( friso_dic_t dic, friso_lex_t lex, string word ) {
  275. if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
  276. return hash_exist_mapping( dic[lex], word );
  277. }
  278. return 0;
  279. }
  280. //get the lex_entry_t associated with the word.
  281. __EXTERN_API__ lex_entry_t friso_dic_get( friso_dic_t dic, friso_lex_t lex, string word ) {
  282. if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
  283. return ( lex_entry_t ) hash_get_value( dic[lex], word );
  284. }
  285. return NULL;
  286. }
  287. //get the size of the specified type dictionary.
  288. __EXTERN_API__ uint_t friso_spec_dic_size( friso_dic_t dic, friso_lex_t lex ) {
  289. if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
  290. return hash_get_size( dic[lex] );
  291. }
  292. return 0;
  293. }
  294. //get size of the whole dictionary.
  295. __EXTERN_API__ uint_t friso_all_dic_size( friso_dic_t dic ) {
  296. register uint_t size = 0, t;
  297. for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
  298. size += hash_get_size( dic[t] );
  299. }
  300. return size;
  301. }