/vendor/gems/ferret-0.11.4/ext/analysis.c

https://github.com/ekcell/lovdbyless · C · 1552 lines · 1171 code · 208 blank · 173 comment · 319 complexity · c32c5f8f490434c06753681a5ee4d6de MD5 · raw file

  1. #include "analysis.h"
  2. #include "hash.h"
  3. #include <libstemmer.h>
  4. #include <string.h>
  5. #include <ctype.h>
  6. #include <wctype.h>
  7. #include <wchar.h>
  8. /****************************************************************************
  9. *
  10. * Token
  11. *
  12. ****************************************************************************/
  13. __inline Token *tk_set(Token *tk,
  14. char *text, int tlen, int start, int end, int pos_inc)
  15. {
  16. if (tlen >= MAX_WORD_SIZE) {
  17. tlen = MAX_WORD_SIZE - 1;
  18. }
  19. memcpy(tk->text, text, sizeof(char) * tlen);
  20. tk->text[tlen] = '\0';
  21. tk->len = tlen;
  22. tk->start = start;
  23. tk->end = end;
  24. tk->pos_inc = pos_inc;
  25. return tk;
  26. }
  27. __inline Token *tk_set_ts(Token *tk,
  28. char *start, char *end, char *text, int pos_inc)
  29. {
  30. return tk_set(tk, start, (int)(end - start),
  31. (int)(start - text), (int)(end - text), pos_inc);
  32. }
  33. __inline Token *tk_set_no_len(Token *tk,
  34. char *text, int start, int end, int pos_inc)
  35. {
  36. return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
  37. }
  38. __inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
  39. int pos_inc)
  40. {
  41. int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
  42. tk->text[len] = '\0';
  43. tk->len = len;
  44. tk->start = start;
  45. tk->end = end;
  46. tk->pos_inc = pos_inc;
  47. return tk;
  48. }
  49. int tk_eq(Token *tk1, Token *tk2)
  50. {
  51. return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
  52. tk1->start == tk2->start && tk1->end == tk2->end &&
  53. tk1->pos_inc == tk2->pos_inc);
  54. }
  55. int tk_cmp(Token *tk1, Token *tk2)
  56. {
  57. int cmp;
  58. if (tk1->start > tk2->start) {
  59. cmp = 1;
  60. }
  61. else if (tk1->start < tk2->start) {
  62. cmp = -1;
  63. }
  64. else {
  65. if (tk1->end > tk2->end) {
  66. cmp = 1;
  67. }
  68. else if (tk1->end < tk2->end) {
  69. cmp = -1;
  70. }
  71. else {
  72. cmp = strcmp((char *)tk1->text, (char *)tk2->text);
  73. }
  74. }
  75. return cmp;
  76. }
  77. void tk_destroy(void *p)
  78. {
  79. free(p);
  80. }
  81. Token *tk_new()
  82. {
  83. return ALLOC(Token);
  84. }
  85. /****************************************************************************
  86. *
  87. * TokenStream
  88. *
  89. ****************************************************************************/
  90. void ts_deref(TokenStream *ts)
  91. {
  92. if (--ts->ref_cnt <= 0) {
  93. ts->destroy_i(ts);
  94. }
  95. }
  96. static TokenStream *ts_reset(TokenStream *ts, char *text)
  97. {
  98. ts->t = ts->text = text;
  99. return ts;
  100. }
  101. TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
  102. {
  103. TokenStream *ts = (TokenStream *)ecalloc(size);
  104. memcpy(ts, orig_ts, size);
  105. ts->ref_cnt = 1;
  106. return ts;
  107. }
  108. TokenStream *ts_new_i(size_t size)
  109. {
  110. TokenStream *ts = ecalloc(size);
  111. ts->destroy_i = (void (*)(TokenStream *))&free;
  112. ts->reset = &ts_reset;
  113. ts->ref_cnt = 1;
  114. return ts;
  115. }
  116. /****************************************************************************
  117. * CachedTokenStream
  118. ****************************************************************************/
  119. #define CTS(token_stream) ((CachedTokenStream *)(token_stream))
  120. static TokenStream *cts_clone_i(TokenStream *orig_ts)
  121. {
  122. return ts_clone_size(orig_ts, sizeof(CachedTokenStream));
  123. }
  124. static TokenStream *cts_new()
  125. {
  126. TokenStream *ts = ts_new(CachedTokenStream);
  127. ts->clone_i = &cts_clone_i;
  128. return ts;
  129. }
  130. /* * Multi-byte TokenStream * */
  131. #define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
/* Decode the next multi-byte character from +s+ into +wchr+ using the
 * conversion state +state+, returning the number of bytes consumed.
 * On an invalid sequence, scan forward byte by byte (resetting the
 * shift state each time) until a decodable character or the end of the
 * string is found, and report the number of bytes skipped so callers
 * always make progress.  *wchr is set to 0 at end of input. */
__inline int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
{
    int num_bytes;
    if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
        const char *t = s;
        do {
            t++;
            ZEROSET(state, mbstate_t); /* mbstate is undefined after an error */
            num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
        } while ((num_bytes < 0) && (*t != 0));
        num_bytes = t - s; /* report the skipped bytes, not the new char's */
        if (*t == 0) *wchr = 0;
    }
    return num_bytes;
}
  147. static TokenStream *mb_ts_reset(TokenStream *ts, char *text)
  148. {
  149. ZEROSET(&(MBTS(ts)->state), mbstate_t);
  150. ts_reset(ts, text);
  151. return ts;
  152. }
  153. static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
  154. {
  155. return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
  156. }
  157. TokenStream *mb_ts_new()
  158. {
  159. TokenStream *ts = ts_new(MultiByteTokenStream);
  160. ts->reset = &mb_ts_reset;
  161. ts->clone_i = &mb_ts_clone_i;
  162. ts->ref_cnt = 1;
  163. return ts;
  164. }
  165. /****************************************************************************
  166. *
  167. * Analyzer
  168. *
  169. ****************************************************************************/
  170. void a_deref(Analyzer *a)
  171. {
  172. if (--a->ref_cnt <= 0) {
  173. a->destroy_i(a);
  174. }
  175. }
  176. static void a_standard_destroy_i(Analyzer *a)
  177. {
  178. if (a->current_ts) {
  179. ts_deref(a->current_ts);
  180. }
  181. free(a);
  182. }
  183. static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
  184. {
  185. TokenStream *ts;
  186. (void)field;
  187. ts = ts_clone(a->current_ts);
  188. return ts->reset(ts, text);
  189. }
  190. Analyzer *analyzer_new(TokenStream *ts,
  191. void (*destroy_i)(Analyzer *a),
  192. TokenStream *(*get_ts)(Analyzer *a, char *field,
  193. char *text))
  194. {
  195. Analyzer *a = ALLOC(Analyzer);
  196. a->current_ts = ts;
  197. a->destroy_i = (destroy_i ? destroy_i : &a_standard_destroy_i);
  198. a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
  199. a->ref_cnt = 1;
  200. return a;
  201. }
  202. /****************************************************************************
  203. *
  204. * Non
  205. *
  206. ****************************************************************************/
  207. /*
  208. * NonTokenizer
  209. */
  210. static Token *nt_next(TokenStream *ts)
  211. {
  212. if (ts->t) {
  213. size_t len = strlen(ts->t);
  214. ts->t = NULL;
  215. return tk_set(&(CTS(ts)->token), ts->text, len, 0, len, 1);
  216. }
  217. else {
  218. return NULL;
  219. }
  220. }
  221. TokenStream *non_tokenizer_new()
  222. {
  223. TokenStream *ts = cts_new();
  224. ts->next = &nt_next;
  225. return ts;
  226. }
  227. /*
  228. * NonAnalyzer
  229. */
  230. Analyzer *non_analyzer_new()
  231. {
  232. return analyzer_new(non_tokenizer_new(), NULL, NULL);
  233. }
  234. /****************************************************************************
  235. *
  236. * Whitespace
  237. *
  238. ****************************************************************************/
  239. /*
  240. * WhitespaceTokenizer
  241. */
  242. static Token *wst_next(TokenStream *ts)
  243. {
  244. char *t = ts->t;
  245. char *start;
  246. while (*t != '\0' && isspace(*t)) {
  247. t++;
  248. }
  249. if (*t == '\0') {
  250. return NULL;
  251. }
  252. start = t;
  253. while (*t != '\0' && !isspace(*t)) {
  254. t++;
  255. }
  256. ts->t = t;
  257. return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  258. }
  259. TokenStream *whitespace_tokenizer_new()
  260. {
  261. TokenStream *ts = cts_new();
  262. ts->next = &wst_next;
  263. return ts;
  264. }
  265. /*
  266. * Multi-byte WhitespaceTokenizer
  267. */
/* Multi-byte WhitespaceTokenizer: skip (wide) whitespace, then collect
 * characters up to the next whitespace, stepping with mb_next_char so
 * multi-byte sequences are consumed whole.  Returns NULL at end of
 * text. */
static Token *mb_wst_next(TokenStream *ts)
{
    int i;
    char *start;
    char *t = ts->t;
    wchar_t wchr;
    mbstate_t *state = &(MBTS(ts)->state);
    i = mb_next_char(&wchr, t, state);
    /* skip leading whitespace */
    while (wchr != 0 && iswspace(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    if (wchr == 0) {
        return NULL; /* ran off the end of the text */
    }
    start = t;
    t += i;
    i = mb_next_char(&wchr, t, state);
    /* consume the non-whitespace run */
    while (wchr != 0 && !iswspace(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    ts->t = t;
    return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
}
  293. /*
  294. * Lowercasing Multi-byte WhitespaceTokenizer
  295. */
/* Lowercasing multi-byte WhitespaceTokenizer: like mb_wst_next but the
 * word's characters are collected into a wide buffer, lowercased as
 * they are read, and converted back by w_tk_set.  Characters beyond
 * MAX_WORD_SIZE are dropped from the token text, though start/end
 * offsets still span the whole word. */
static Token *mb_wst_next_lc(TokenStream *ts)
{
    int i;
    char *start;
    char *t = ts->t;
    wchar_t wchr;
    wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
    mbstate_t *state = &(MBTS(ts)->state);
    w = wbuf;
    w_end = &wbuf[MAX_WORD_SIZE];
    i = mb_next_char(&wchr, t, state);
    /* skip leading whitespace */
    while (wchr != 0 && iswspace(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    if (wchr == 0) {
        return NULL; /* end of text */
    }
    start = t;
    t += i;
    *w++ = towlower(wchr); /* first character of the word */
    i = mb_next_char(&wchr, t, state);
    while (wchr != 0 && !iswspace(wchr)) {
        if (w < w_end) { /* silently truncate over-long words */
            *w++ = towlower(wchr);
        }
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    *w = 0;
    ts->t = t;
    return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
                    (int)(t - ts->text), 1);
}
  330. TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
  331. {
  332. TokenStream *ts = mb_ts_new();
  333. ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
  334. return ts;
  335. }
  336. /*
  337. * WhitespaceAnalyzers
  338. */
  339. Analyzer *whitespace_analyzer_new(bool lowercase)
  340. {
  341. TokenStream *ts;
  342. if (lowercase) {
  343. ts = lowercase_filter_new(whitespace_tokenizer_new());
  344. }
  345. else {
  346. ts = whitespace_tokenizer_new();
  347. }
  348. return analyzer_new(ts, NULL, NULL);
  349. }
  350. Analyzer *mb_whitespace_analyzer_new(bool lowercase)
  351. {
  352. return analyzer_new(mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
  353. }
  354. /****************************************************************************
  355. *
  356. * Letter
  357. *
  358. ****************************************************************************/
  359. /*
  360. * LetterTokenizer
  361. */
  362. Token *lt_next(TokenStream *ts)
  363. {
  364. char *start;
  365. char *t = ts->t;
  366. while (*t != '\0' && !isalpha(*t)) {
  367. t++;
  368. }
  369. if (*t == '\0') {
  370. return NULL;
  371. }
  372. start = t;
  373. while (*t != '\0' && isalpha(*t)) {
  374. t++;
  375. }
  376. ts->t = t;
  377. return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  378. }
  379. TokenStream *letter_tokenizer_new()
  380. {
  381. TokenStream *ts = cts_new();
  382. ts->next = &lt_next;
  383. return ts;
  384. }
  385. /*
  386. * Multi-byte LetterTokenizer
  387. */
/* Multi-byte LetterTokenizer: skip non-letters, then collect the run
 * of (wide) alphabetic characters, decoding with mb_next_char.
 * Returns NULL at end of text. */
Token *mb_lt_next(TokenStream *ts)
{
    int i;
    char *start;
    char *t = ts->t;
    wchar_t wchr;
    mbstate_t *state = &(MBTS(ts)->state);
    i = mb_next_char(&wchr, t, state);
    /* skip non-letters */
    while (wchr != 0 && !iswalpha(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    if (wchr == 0) {
        return NULL; /* end of text */
    }
    start = t;
    t += i;
    i = mb_next_char(&wchr, t, state);
    /* consume the letter run */
    while (wchr != 0 && iswalpha(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    ts->t = t;
    return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
}
  413. /*
  414. * Lowercasing Multi-byte LetterTokenizer
  415. */
/* Lowercasing multi-byte LetterTokenizer: like mb_lt_next but the
 * letters are collected into a wide buffer, lowercased as they are
 * read, and converted back by w_tk_set.  Characters beyond
 * MAX_WORD_SIZE are dropped from the token text while the offsets
 * still span the whole run. */
Token *mb_lt_next_lc(TokenStream *ts)
{
    int i;
    char *start;
    char *t = ts->t;
    wchar_t wchr;
    wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
    mbstate_t *state = &(MBTS(ts)->state);
    w = wbuf;
    w_end = &wbuf[MAX_WORD_SIZE];
    i = mb_next_char(&wchr, t, state);
    /* skip non-letters */
    while (wchr != 0 && !iswalpha(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    if (wchr == 0) {
        return NULL; /* end of text */
    }
    start = t;
    t += i;
    *w++ = towlower(wchr); /* first letter of the token */
    i = mb_next_char(&wchr, t, state);
    while (wchr != 0 && iswalpha(wchr)) {
        if (w < w_end) { /* silently truncate over-long tokens */
            *w++ = towlower(wchr);
        }
        t += i;
        i = mb_next_char(&wchr, t, state);
    }
    *w = 0;
    ts->t = t;
    return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
                    (int)(t - ts->text), 1);
}
  450. TokenStream *mb_letter_tokenizer_new(bool lowercase)
  451. {
  452. TokenStream *ts = mb_ts_new();
  453. ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
  454. return ts;
  455. }
  456. /*
  457. * LetterAnalyzers
  458. */
  459. Analyzer *letter_analyzer_new(bool lowercase)
  460. {
  461. TokenStream *ts;
  462. if (lowercase) {
  463. ts = lowercase_filter_new(letter_tokenizer_new());
  464. }
  465. else {
  466. ts = letter_tokenizer_new();
  467. }
  468. return analyzer_new(ts, NULL, NULL);
  469. }
  470. Analyzer *mb_letter_analyzer_new(bool lowercase)
  471. {
  472. return analyzer_new(mb_letter_tokenizer_new(lowercase), NULL, NULL);
  473. }
  474. /****************************************************************************
  475. *
  476. * Standard
  477. *
  478. ****************************************************************************/
  479. #define STDTS(token_stream) ((StandardTokenizer *)(token_stream))
  480. /*
  481. * StandardTokenizer
  482. */
  483. static int std_get_alpha(TokenStream *ts, char *token)
  484. {
  485. int i = 0;
  486. char *t = ts->t;
  487. while (t[i] != '\0' && isalnum(t[i])) {
  488. if (i < MAX_WORD_SIZE) {
  489. token[i] = t[i];
  490. }
  491. i++;
  492. }
  493. return i;
  494. }
/* Multi-byte get_alpha: measure the leading run of (wide) alphanumeric
 * characters at ts->t and copy its raw bytes into +token+.  Returns
 * the (possibly clamped) byte length.
 * NOTE(review): the clamp condition is `i > MAX_WORD_SIZE` but the
 * clamped value is MAX_WORD_SIZE - 1, so a run of exactly
 * MAX_WORD_SIZE bytes is copied unclamped — the caller's buffer is
 * MAX_WORD_SIZE + 1 bytes so this still fits; confirm before reusing
 * with a smaller buffer. */
static int mb_std_get_alpha(TokenStream *ts, char *token)
{
    char *t = ts->t;
    wchar_t wchr;
    int i;
    mbstate_t state; ZEROSET(&state, mbstate_t);
    i = mb_next_char(&wchr, t, &state);
    while (wchr != 0 && iswalnum(wchr)) {
        t += i;
        i = mb_next_char(&wchr, t, &state);
    }
    i = (int)(t - ts->t); /* byte length of the run */
    if (i > MAX_WORD_SIZE) {
        i = MAX_WORD_SIZE - 1;
    }
    memcpy(token, ts->t, i);
    return i;
}
  513. /*
  514. static int std_get_alnum(TokenStream *ts, char *token)
  515. {
  516. int i = 0;
  517. char *t = ts->t;
  518. while (t[i] != '\0' && isalnum(t[i])) {
  519. if (i < MAX_WORD_SIZE) {
  520. token[i] = t[i];
  521. }
  522. i++;
  523. }
  524. return i;
  525. }
  526. static int mb_std_get_alnum(TokenStream *ts, char *token)
  527. {
  528. char *t = ts->t;
  529. wchar_t wchr;
  530. int i;
  531. mbstate_t state; ZEROSET(&state, mbstate_t);
  532. i = mb_next_char(&wchr, t, &state);
  533. while (wchr != 0 && iswalnum(wchr)) {
  534. t += i;
  535. i = mb_next_char(&wchr, t, &state);
  536. }
  537. i = (int)(t - ts->t);
  538. if (i > MAX_WORD_SIZE) {
  539. i = MAX_WORD_SIZE - 1;
  540. }
  541. memcpy(token, ts->t, i);
  542. return i;
  543. }
  544. */
  545. static int isnumpunc(char c)
  546. {
  547. return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
  548. || c == '-');
  549. }
  550. static int w_isnumpunc(wchar_t c)
  551. {
  552. return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
  553. || c == L'-');
  554. }
  555. static int isurlpunc(char c)
  556. {
  557. return (c == '.' || c == '/' || c == '-' || c == '_');
  558. }
  559. static int isurlc(char c)
  560. {
  561. return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
  562. }
  563. static int isurlxatpunc(char c)
  564. {
  565. return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
  566. }
  567. static int isurlxatc(char c)
  568. {
  569. return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
  570. || isalnum(c));
  571. }
  572. static bool std_is_tok_char(char *c)
  573. {
  574. if (isspace(*c)) {
  575. return false; /* most common so check first. */
  576. }
  577. if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
  578. *c == '@' || *c == '\'' || *c == ':') {
  579. return true;
  580. }
  581. return false;
  582. }
/* Multi-byte variant of std_is_tok_char: decode one character at +t+
 * (with a fresh conversion state) and test whether it may appear
 * inside a token. */
static bool mb_std_is_tok_char(char *t)
{
    wchar_t c;
    mbstate_t state; ZEROSET(&state, mbstate_t);
    if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
        /* error which we can handle next time round. For now just return
         * false so that we can return a token */
        return false;
    }
    if (iswspace(c)) {
        return false; /* most common so check first. */
    }
    if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
        || c == L':') {
        return true;
    }
    return false;
}
  601. /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
  602. * least one digit.
  603. * (alnum) = [a-zA-Z0-9]
  604. * (punc) = [_\/.,-]
  605. */
  606. static int std_get_number(char *input)
  607. {
  608. int i = 0;
  609. int count = 0;
  610. int last_seen_digit = 2;
  611. int seen_digit = false;
  612. while (last_seen_digit >= 0) {
  613. while ((input[i] != '\0') && isalnum(input[i])) {
  614. if ((last_seen_digit < 2) && isdigit(input[i])) {
  615. last_seen_digit = 2;
  616. }
  617. if ((seen_digit == false) && isdigit(input[i])) {
  618. seen_digit = true;
  619. }
  620. i++;
  621. }
  622. last_seen_digit--;
  623. if (!isnumpunc(input[i]) || !isalnum(input[i + 1])) {
  624. if (last_seen_digit >= 0) {
  625. count = i;
  626. }
  627. break;
  628. }
  629. count = i;
  630. i++;
  631. }
  632. if (seen_digit) {
  633. return count;
  634. }
  635. else {
  636. return 0;
  637. }
  638. }
  639. static int std_get_apostrophe(char *input)
  640. {
  641. char *t = input;
  642. while (isalpha(*t) || *t == '\'') {
  643. t++;
  644. }
  645. return (int)(t - input);
  646. }
/* Multi-byte variant of std_get_apostrophe: byte length of the
 * leading run of (wide) letters and apostrophes in +input+. */
static int mb_std_get_apostrophe(char *input)
{
    char *t = input;
    wchar_t wchr;
    int i;
    mbstate_t state; ZEROSET(&state, mbstate_t);
    i = mb_next_char(&wchr, t, &state);
    while (iswalpha(wchr) || wchr == L'\'') {
        t += i;
        i = mb_next_char(&wchr, t, &state);
    }
    return (int)(t - input);
}
/* Scan a URL starting at input[i], copying bytes into +token+ (only
 * the first MAX_WORD_SIZE bytes are stored), stopping at the first
 * character that cannot appear in a URL or at two punctuation
 * characters in a row.  Trailing punctuation is stripped.  Returns the
 * length scanned (which the caller must clamp before using it as a
 * token index).
 * NOTE(review): when called with i == 0 and input[0] is URL
 * punctuation, input[i - 1] reads one byte before the buffer; the
 * callers in this file only pass positions preceded by valid text —
 * confirm before reusing elsewhere. */
static int std_get_url(char *input, char *token, int i)
{
    while (isurlc(input[i])) {
        if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
            break; /* can't have two puncs in a row */
        }
        if (i < MAX_WORD_SIZE) {
            token[i] = input[i];
        }
        i++;
    }
    /* strip trailing puncs */
    while (isurlpunc(input[i - 1])) {
        i--;
    }
    return i;
}
  677. /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
  678. */
  679. static int std_get_company_name(char *input)
  680. {
  681. int i = 0;
  682. while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
  683. i++;
  684. }
  685. return i;
  686. }
  687. /*
  688. static int mb_std_get_company_name(char *input, TokenStream *ts)
  689. {
  690. char *t = input;
  691. wchar_t wchr;
  692. int i;
  693. mbstate_t state; ZEROSET(&state, mbstate_t);
  694. i = mb_next_char(&wchr, t, &state);
  695. while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
  696. t += i;
  697. i = mb_next_char(&wchr, t, &state);
  698. }
  699. return (int)(t - input);
  700. }
  701. */
  702. static bool std_advance_to_start(TokenStream *ts)
  703. {
  704. char *t = ts->t;
  705. while (*t != '\0' && !isalnum(*t)) {
  706. if (isnumpunc(*t) && isdigit(t[1])) break;
  707. t++;
  708. }
  709. ts->t = t;
  710. return (*t != '\0');
  711. }
/* Multi-byte advance_to_start: move ts->t to the first character that
 * can start a token — a wide letter, an ASCII digit, or numeric
 * punctuation directly followed by a digit.  Returns false at end of
 * text.
 * NOTE(review): mixes wide-character (iswalpha) and single-byte
 * (isdigit) tests, so only ASCII digits can start numbers — this
 * matches the byte-based std_get_number, but confirm it is
 * intentional for non-ASCII digit locales. */
static bool mb_std_advance_to_start(TokenStream *ts)
{
    int i;
    wchar_t wchr;
    mbstate_t state; ZEROSET(&state, mbstate_t);
    i = mb_next_char(&wchr, ts->t, &state);
    while (wchr != 0 && !iswalpha(wchr) && !isdigit(*(ts->t))) {
        if (isnumpunc(*ts->t) && isdigit(ts->t[1])) break;
        ts->t += i;
        i = mb_next_char(&wchr, ts->t, &state);
    }
    return (wchr != 0);
}
  725. static Token *std_next(TokenStream *ts)
  726. {
  727. StandardTokenizer *std_tz = STDTS(ts);
  728. char *s;
  729. char *t;
  730. char *start = NULL;
  731. char *num_end = NULL;
  732. char token[MAX_WORD_SIZE + 1];
  733. int token_i = 0;
  734. int len;
  735. bool is_acronym;
  736. bool seen_at_symbol;
  737. if (!std_tz->advance_to_start(ts)) {
  738. return NULL;
  739. }
  740. start = t = ts->t;
  741. token_i = std_tz->get_alpha(ts, token);
  742. t += token_i;
  743. if (!std_tz->is_tok_char(t)) {
  744. /* very common case, ie a plain word, so check and return */
  745. ts->t = t;
  746. return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  747. }
  748. if (*t == '\'') { /* apostrophe case. */
  749. t += std_tz->get_apostrophe(t);
  750. ts->t = t;
  751. len = (int)(t - start);
  752. /* strip possesive */
  753. if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
  754. t -= 2;
  755. tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  756. CTS(ts)->token.end += 2;
  757. }
  758. else if (t[-1] == '\'') {
  759. t -= 1;
  760. tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  761. CTS(ts)->token.end += 1;
  762. }
  763. else {
  764. tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  765. }
  766. return &(CTS(ts)->token);
  767. }
  768. if (*t == '&') { /* apostrophe case. */
  769. t += std_get_company_name(t);
  770. ts->t = t;
  771. return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  772. }
  773. if ((isdigit(*t) || isnumpunc(*t)) /* possibly a number */
  774. && (len = std_get_number(t) > 0)) {
  775. num_end = start + len;
  776. if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
  777. ts->t = num_end;
  778. return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
  779. }
  780. /* else there may be a longer token so check */
  781. }
  782. if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
  783. /* check for a known url start */
  784. token[token_i] = '\0';
  785. t += 3;
  786. while (*t == '/') {
  787. t++;
  788. }
  789. if (isalpha(*t) &&
  790. (memcmp(token, "ftp", 3) == 0 ||
  791. memcmp(token, "http", 4) == 0 ||
  792. memcmp(token, "https", 5) == 0 ||
  793. memcmp(token, "file", 4) == 0)) {
  794. len = std_get_url(t, token, 0); /* dispose of first part of the URL */
  795. }
  796. else { /* still treat as url but keep the first part */
  797. token_i = (int)(t - start);
  798. memcpy(token, start, token_i * sizeof(char));
  799. len = token_i + std_get_url(t, token, token_i); /* keep start */
  800. }
  801. ts->t = t + len;
  802. token[len] = 0;
  803. return tk_set(&(CTS(ts)->token), token, len, (int)(start - ts->text),
  804. (int)(ts->t - ts->text), 1);
  805. }
  806. /* now see how long a url we can find. */
  807. is_acronym = true;
  808. seen_at_symbol = false;
  809. while (isurlxatc(*t)) {
  810. if (is_acronym && !isalpha(*t) && (*t != '.')) {
  811. is_acronym = false;
  812. }
  813. if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
  814. break; /* can't have two punctuation characters in a row */
  815. }
  816. if (*t == '@') {
  817. if (seen_at_symbol) {
  818. break; /* we can only have one @ symbol */
  819. }
  820. else {
  821. seen_at_symbol = true;
  822. }
  823. }
  824. t++;
  825. }
  826. while (isurlxatpunc(t[-1])) {
  827. t--; /* strip trailing punctuation */
  828. }
  829. if (num_end == NULL || t > num_end) {
  830. ts->t = t;
  831. if (is_acronym) { /* check it is one letter followed by one '.' */
  832. for (s = start; s < t - 1; s++) {
  833. if (isalpha(*s) && (s[1] != '.'))
  834. is_acronym = false;
  835. }
  836. }
  837. if (is_acronym) { /* strip '.'s */
  838. for (s = start + token_i; s < t; s++) {
  839. if (*s != '.') {
  840. token[token_i] = *s;
  841. token_i++;
  842. }
  843. }
  844. tk_set(&(CTS(ts)->token), token, token_i,
  845. (int)(start - ts->text),
  846. (int)(t - ts->text), 1);
  847. }
  848. else { /* just return the url as is */
  849. tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  850. }
  851. }
  852. else { /* return the number */
  853. ts->t = num_end;
  854. tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
  855. }
  856. return &(CTS(ts)->token);
  857. }
  858. static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
  859. {
  860. return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
  861. }
  862. static TokenStream *std_ts_new()
  863. {
  864. TokenStream *ts = ts_new(StandardTokenizer);
  865. ts->clone_i = &std_ts_clone_i;
  866. ts->next = &std_next;
  867. return ts;
  868. }
  869. TokenStream *standard_tokenizer_new()
  870. {
  871. TokenStream *ts = std_ts_new();
  872. STDTS(ts)->advance_to_start = &std_advance_to_start;
  873. STDTS(ts)->get_alpha = &std_get_alpha;
  874. STDTS(ts)->is_tok_char = &std_is_tok_char;
  875. STDTS(ts)->get_apostrophe = &std_get_apostrophe;
  876. return ts;
  877. }
  878. TokenStream *mb_standard_tokenizer_new()
  879. {
  880. TokenStream *ts = std_ts_new();
  881. STDTS(ts)->advance_to_start = &mb_std_advance_to_start;
  882. STDTS(ts)->get_alpha = &mb_std_get_alpha;
  883. STDTS(ts)->is_tok_char = &mb_std_is_tok_char;
  884. STDTS(ts)->get_apostrophe = &mb_std_get_apostrophe;
  885. return ts;
  886. }
  887. /****************************************************************************
  888. *
  889. * Filters
  890. *
  891. ****************************************************************************/
  892. #define TkFilt(filter) ((TokenFilter *)(filter))
  893. TokenStream *filter_clone_size(TokenStream *ts, size_t size)
  894. {
  895. TokenStream *ts_new = ts_clone_size(ts, size);
  896. TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
  897. return ts_new;
  898. }
  899. static TokenStream *filter_clone_i(TokenStream *ts)
  900. {
  901. return filter_clone_size(ts, sizeof(TokenFilter));
  902. }
  903. static TokenStream *filter_reset(TokenStream *ts, char *text)
  904. {
  905. TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
  906. return ts;
  907. }
  908. static void filter_destroy_i(TokenStream *ts)
  909. {
  910. ts_deref(TkFilt(ts)->sub_ts);
  911. free(ts);
  912. }
  913. #define tf_new(type, sub) tf_new_i(sizeof(type), sub)
  914. TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
  915. {
  916. TokenStream *ts = (TokenStream *)ecalloc(size);
  917. TkFilt(ts)->sub_ts = sub_ts;
  918. ts->clone_i = &filter_clone_i;
  919. ts->destroy_i = &filter_destroy_i;
  920. ts->reset = &filter_reset;
  921. ts->ref_cnt = 1;
  922. return ts;
  923. }
  924. /****************************************************************************
  925. * StopFilter
  926. ****************************************************************************/
  927. #define StopFilt(filter) ((StopFilter *)(filter))
  928. static void sf_destroy_i(TokenStream *ts)
  929. {
  930. h_destroy(StopFilt(ts)->words);
  931. filter_destroy_i(ts);
  932. }
  933. static TokenStream *sf_clone_i(TokenStream *orig_ts)
  934. {
  935. TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
  936. REF(StopFilt(new_ts)->words);
  937. return new_ts;
  938. }
  939. static Token *sf_next(TokenStream *ts)
  940. {
  941. int pos_inc = 0;
  942. HashTable *words = StopFilt(ts)->words;
  943. TokenFilter *tf = TkFilt(ts);
  944. Token *tk = tf->sub_ts->next(tf->sub_ts);
  945. while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
  946. pos_inc += tk->pos_inc;
  947. tk = tf->sub_ts->next(tf->sub_ts);
  948. }
  949. if (tk != NULL) {
  950. tk->pos_inc += pos_inc;
  951. }
  952. return tk;
  953. }
  954. TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
  955. const char **words, int len)
  956. {
  957. int i;
  958. char *word;
  959. HashTable *word_table = h_new_str(&free, (free_ft) NULL);
  960. TokenStream *ts = tf_new(StopFilter, sub_ts);
  961. for (i = 0; i < len; i++) {
  962. word = estrdup(words[i]);
  963. h_set(word_table, word, word);
  964. }
  965. StopFilt(ts)->words = word_table;
  966. ts->next = &sf_next;
  967. ts->destroy_i = &sf_destroy_i;
  968. ts->clone_i = &sf_clone_i;
  969. return ts;
  970. }
  971. TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
  972. const char **words)
  973. {
  974. char *word;
  975. HashTable *word_table = h_new_str(&free, (free_ft) NULL);
  976. TokenStream *ts = tf_new(StopFilter, sub_ts);
  977. while (*words) {
  978. word = estrdup(*words);
  979. h_set(word_table, word, word);
  980. words++;
  981. }
  982. StopFilt(ts)->words = word_table;
  983. ts->next = &sf_next;
  984. ts->destroy_i = &sf_destroy_i;
  985. ts->clone_i = &sf_clone_i;
  986. return ts;
  987. }
  988. TokenStream *stop_filter_new(TokenStream *ts)
  989. {
  990. return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
  991. }
  992. /****************************************************************************
  993. * MappingFilter
  994. ****************************************************************************/
  995. #define MFilt(filter) ((MappingFilter *)(filter))
  996. static void mf_destroy_i(TokenStream *ts)
  997. {
  998. mulmap_destroy(MFilt(ts)->mapper);
  999. filter_destroy_i(ts);
  1000. }
  1001. static TokenStream *mf_clone_i(TokenStream *orig_ts)
  1002. {
  1003. TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
  1004. REF(MFilt(new_ts)->mapper);
  1005. return new_ts;
  1006. }
  1007. static Token *mf_next(TokenStream *ts)
  1008. {
  1009. char buf[MAX_WORD_SIZE + 1];
  1010. MultiMapper *mapper = MFilt(ts)->mapper;
  1011. TokenFilter *tf = TkFilt(ts);
  1012. Token *tk = tf->sub_ts->next(tf->sub_ts);
  1013. if (tk != NULL) {
  1014. tk->len = mulmap_map_len(mapper, buf, tk->text, MAX_WORD_SIZE);
  1015. memcpy(tk->text, buf, tk->len + 1);
  1016. }
  1017. return tk;
  1018. }
  1019. static TokenStream *mf_reset(TokenStream *ts, char *text)
  1020. {
  1021. MultiMapper *mm = MFilt(ts)->mapper;
  1022. if (mm->d_size == 0) {
  1023. mulmap_compile(MFilt(ts)->mapper);
  1024. }
  1025. filter_reset(ts, text);
  1026. return ts;
  1027. }
  1028. TokenStream *mapping_filter_new(TokenStream *sub_ts)
  1029. {
  1030. TokenStream *ts = tf_new(MappingFilter, sub_ts);
  1031. MFilt(ts)->mapper = mulmap_new();
  1032. ts->next = &mf_next;
  1033. ts->destroy_i = &mf_destroy_i;
  1034. ts->clone_i = &mf_clone_i;
  1035. ts->reset = &mf_reset;
  1036. return ts;
  1037. }
  1038. TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
  1039. const char *replacement)
  1040. {
  1041. mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
  1042. return ts;
  1043. }
  1044. /****************************************************************************
  1045. * HyphenFilter
  1046. ****************************************************************************/
  1047. #define HyphenFilt(filter) ((HyphenFilter *)(filter))
  1048. static TokenStream *hf_clone_i(TokenStream *orig_ts)
  1049. {
  1050. TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(HyphenFilter));
  1051. return new_ts;
  1052. }
/* HyphenFilter: when the wrapped stream yields a purely-alphabetic
 * hyphenated word (e.g. "x-ray-vision"), emit the concatenated form
 * first, then on subsequent calls emit each hyphen part as its own
 * token (the first part shares the concatenated token's position,
 * later parts get pos_inc 1).  Other tokens pass through unchanged.
 * NOTE(review): isalpha() is called on a plain char here — bytes
 * >= 0x80 with a signed char are technically UB; confirm inputs are
 * ASCII-clean at this point. */
static Token *hf_next(TokenStream *ts)
{
    HyphenFilter *hf = HyphenFilt(ts);
    TokenFilter *tf = TkFilt(ts);
    Token *tk = hf->tk;
    if (hf->pos < hf->len) {
        /* still emitting the NUL-separated parts stored in hf->text */
        const int pos = hf->pos;
        const int text_len = strlen(hf->text + pos);
        strcpy(tk->text, hf->text + pos);
        tk->pos_inc = ((pos != 0) ? 1 : 0);
        tk->start = hf->start + pos;
        tk->end = tk->start + text_len;
        hf->pos += text_len + 1; /* step past the '\0' separator */
        tk->len = text_len;
        return tk;
    }
    else {
        char *p;
        bool seen_hyphen = false;
        bool seen_other_punc = false;
        hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
        if (NULL == tk) return NULL;
        /* scan from the second character: a leading '-' does not count
         * as hyphenation */
        p = tk->text + 1;
        while (*p) {
            if (*p == '-') {
                seen_hyphen = true;
            }
            else if (!isalpha(*p)) {
                seen_other_punc = true;
                break;
            }
            p++;
        }
        if (seen_hyphen && !seen_other_punc) {
            /* one pass builds two views: hf->text receives the parts
             * as consecutive NUL-terminated strings (cursor q), while
             * tk->text is compacted in place to the hyphen-less
             * concatenation (cursor r) */
            char *q = hf->text;
            char *r = tk->text;
            p = tk->text;
            while (*p) {
                if (*p == '-') {
                    *q = '\0';
                }
                else {
                    *r = *q = *p;
                    r++;
                }
                q++;
                p++;
            }
            *r = *q = '\0';
            hf->start = tk->start;
            hf->pos = 0;
            hf->len = q - hf->text;
            tk->len = r - tk->text;
        }
    }
    return tk;
}
  1110. TokenStream *hyphen_filter_new(TokenStream *sub_ts)
  1111. {
  1112. TokenStream *ts = tf_new(HyphenFilter, sub_ts);
  1113. ts->next = &hf_next;
  1114. ts->clone_i = &hf_clone_i;
  1115. return ts;
  1116. }
  1117. /****************************************************************************
  1118. * LowerCaseFilter
  1119. ****************************************************************************/
  1120. Token *mb_lcf_next(TokenStream *ts)
  1121. {
  1122. wchar_t wbuf[MAX_WORD_SIZE + 1], *wchr;
  1123. Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
  1124. int x;
  1125. wbuf[MAX_WORD_SIZE] = 0;
  1126. if (tk == NULL) {
  1127. return tk;
  1128. }
  1129. if ((x=mbstowcs(wbuf, tk->text, MAX_WORD_SIZE)) <= 0) return tk;
  1130. wchr = wbuf;
  1131. while (*wchr != 0) {
  1132. *wchr = towlower(*wchr);
  1133. wchr++;
  1134. }
  1135. tk->len = wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
  1136. if (tk->len <= 0) {
  1137. strcpy(tk->text, "BAD_DATA");
  1138. tk->len = 8;
  1139. }
  1140. tk->text[tk->len] = '\0';
  1141. return tk;
  1142. }
  1143. TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
  1144. {
  1145. TokenStream *ts = tf_new(TokenFilter, sub_ts);
  1146. ts->next = &mb_lcf_next;
  1147. return ts;
  1148. }
  1149. Token *lcf_next(TokenStream *ts)
  1150. {
  1151. int i = 0;
  1152. Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
  1153. if (tk == NULL) {
  1154. return tk;
  1155. }
  1156. while (tk->text[i] != '\0') {
  1157. tk->text[i] = tolower(tk->text[i]);
  1158. i++;
  1159. }
  1160. return tk;
  1161. }
  1162. TokenStream *lowercase_filter_new(TokenStream *sub_ts)
  1163. {
  1164. TokenStream *ts = tf_new(TokenFilter, sub_ts);
  1165. ts->next = &lcf_next;
  1166. return ts;
  1167. }
  1168. /****************************************************************************
  1169. * StemFilter
  1170. ****************************************************************************/
  1171. #define StemFilt(filter) ((StemFilter *)(filter))
  1172. void stemf_destroy_i(TokenStream *ts)
  1173. {
  1174. sb_stemmer_delete(StemFilt(ts)->stemmer);
  1175. free(StemFilt(ts)->algorithm);
  1176. free(StemFilt(ts)->charenc);
  1177. filter_destroy_i(ts);
  1178. }
  1179. Token *stemf_next(TokenStream *ts)
  1180. {
  1181. int len;
  1182. const sb_symbol *stemmed;
  1183. struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
  1184. TokenFilter *tf = TkFilt(ts);
  1185. Token *tk = tf->sub_ts->next(tf->sub_ts);
  1186. if (tk == NULL) {
  1187. return tk;
  1188. }
  1189. stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
  1190. len = sb_stemmer_length(stemmer);
  1191. if (len >= MAX_WORD_SIZE) {
  1192. len = MAX_WORD_SIZE - 1;
  1193. }
  1194. memcpy(tk->text, stemmed, len);
  1195. tk->text[len] = '\0';
  1196. tk->len = len;
  1197. return tk;
  1198. }
  1199. TokenStream *stemf_clone_i(TokenStream *orig_ts)
  1200. {
  1201. TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
  1202. StemFilter *stemf = StemFilt(new_ts);
  1203. StemFilter *orig_stemf = StemFilt(orig_ts);
  1204. stemf->stemmer =
  1205. sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
  1206. stemf->algorithm =
  1207. orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
  1208. stemf->charenc =
  1209. orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
  1210. return new_ts;
  1211. }
  1212. TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
  1213. const char *charenc)
  1214. {
  1215. TokenStream *tf = tf_new(StemFilter, ts);
  1216. StemFilt(tf)->stemmer = sb_stemmer_new(algorithm, charenc);
  1217. StemFilt(tf)->algorithm = algorithm ? estrdup(algorithm) : NULL;
  1218. StemFilt(tf)->charenc = charenc ? estrdup(charenc) : NULL;
  1219. tf->next = &stemf_next;
  1220. tf->destroy_i = &stemf_destroy_i;
  1221. tf->clone_i = &stemf_clone_i;
  1222. return tf;
  1223. }
  1224. /****************************************************************************
  1225. *
  1226. * Analyzers
  1227. *
  1228. ****************************************************************************/
  1229. /****************************************************************************
  1230. * Standard
  1231. ****************************************************************************/
  1232. Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
  1233. bool lowercase)
  1234. {
  1235. TokenStream *ts = standard_tokenizer_new();
  1236. if (lowercase) {
  1237. ts = lowercase_filter_new(ts);
  1238. }
  1239. ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
  1240. return analyzer_new(ts, NULL, NULL);
  1241. }
  1242. Analyzer *standard_analyzer_new_with_words(const char **words,
  1243. bool lowercase)
  1244. {
  1245. TokenStream *ts = standard_tokenizer_new();
  1246. if (lowercase) {
  1247. ts = lowercase_filter_new(ts);
  1248. }
  1249. ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
  1250. return analyzer_new(ts, NULL, NULL);
  1251. }
  1252. Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
  1253. int len, bool lowercase)
  1254. {
  1255. TokenStream *ts = mb_standard_tokenizer_new();
  1256. if (lowercase) {
  1257. ts = mb_lowercase_filter_new(ts);
  1258. }
  1259. ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
  1260. return analyzer_new(ts, NULL, NULL);
  1261. }
  1262. Analyzer *mb_standard_analyzer_new_with_words(const char **words,
  1263. bool lowercase)
  1264. {
  1265. TokenStream *ts = mb_standard_tokenizer_new();
  1266. if (lowercase) {
  1267. ts = mb_lowercase_filter_new(ts);
  1268. }
  1269. ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
  1270. return analyzer_new(ts, NULL, NULL);
  1271. }
  1272. Analyzer *standard_analyzer_new(bool lowercase)
  1273. {
  1274. return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
  1275. lowercase);
  1276. }
  1277. Analyzer *mb_standard_analyzer_new(bool lowercase)
  1278. {
  1279. return mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
  1280. lowercase);
  1281. }
  1282. /****************************************************************************
  1283. *
  1284. * PerFieldAnalyzer
  1285. *
  1286. ****************************************************************************/
  1287. #define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
  1288. void pfa_destroy_i(Analyzer *self)
  1289. {
  1290. h_destroy(PFA(self)->dict);
  1291. a_deref(PFA(self)->default_a);
  1292. free(self);
  1293. }
  1294. TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
  1295. {
  1296. Analyzer *a = h_get(PFA(self)->dict, field);
  1297. if (a == NULL) {
  1298. a = PFA(self)->default_a;
  1299. }
  1300. return a_get_ts(a, field, text);
  1301. }
  1302. void pfa_sub_a_destroy_i(void *p)
  1303. {
  1304. Analyzer *a = (Analyzer *) p;
  1305. a_deref(a);
  1306. }
  1307. void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
  1308. {
  1309. h_set(PFA(self)->dict, estrdup(field), analyzer);
  1310. }
  1311. Analyzer *per_field_analyzer_new(Analyzer *default_a)
  1312. {
  1313. Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
  1314. PFA(a)->default_a = default_a;
  1315. PFA(a)->dict = h_new_str(&free, &pfa_sub_a_destroy_i);
  1316. a->destroy_i = &pfa_destroy_i;
  1317. a->get_ts = pfa_get_ts;
  1318. a->ref_cnt = 1;
  1319. return a;
  1320. }
  1321. #ifdef ALONE
  1322. int main(int argc, char **argv)
  1323. {
  1324. char buf[10000];
  1325. Analyzer *a = standard_analyzer_new(true);
  1326. TokenStream *ts;
  1327. Token *tk;
  1328. while (fgets(buf, 9999, stdin) != NULL) {
  1329. ts = a_get_ts(a, "hello", buf);
  1330. while ((tk = ts->next(ts)) != NULL) {
  1331. printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
  1332. }
  1333. printf("\n");
  1334. ts_deref(ts);
  1335. }
  1336. return 0;
  1337. }
  1338. #endif