PageRenderTime 55ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/gnu/usr.bin/grep/search.c

https://bitbucket.org/freebsd/freebsd-head/
C | 1289 lines | 1037 code | 115 blank | 137 comment | 311 complexity | e2875dc630ee4a4155b531a6f80e9fad MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, BSD-3-Clause, LGPL-2.0, LGPL-2.1, BSD-2-Clause, 0BSD, JSON, AGPL-1.0, GPL-2.0
  1. /* search.c - searching subroutines using dfa, kwset and regex for grep.
  2. Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2, or (at your option)
  6. any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  14. 02111-1307, USA. */
  15. /* Written August 1992 by Mike Haertel. */
  16. /* $FreeBSD$ */
  17. #ifndef _GNU_SOURCE
  18. # define _GNU_SOURCE 1
  19. #endif
  20. #ifdef HAVE_CONFIG_H
  21. # include <config.h>
  22. #endif
  23. #include <assert.h>
  24. #include <sys/types.h>
  25. #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
  26. /* We can handle multibyte string. */
  27. # define MBS_SUPPORT
  28. # include <wchar.h>
  29. # include <wctype.h>
  30. #endif
  31. #include "system.h"
  32. #include "grep.h"
  33. #include "regex.h"
  34. #include "dfa.h"
  35. #include "kwset.h"
  36. #include "error.h"
  37. #include "xalloc.h"
  38. #ifdef HAVE_LIBPCRE
  39. # include <pcre.h>
  40. #endif
  41. #ifdef HAVE_LANGINFO_CODESET
  42. # include <langinfo.h>
  43. #endif
  44. #define NCHAR (UCHAR_MAX + 1)
  45. /* For -w, we also consider _ to be word constituent. */
  46. #define WCHAR(C) (ISALNUM(C) || (C) == '_')
  47. /* DFA compiled regexp. */
  48. static struct dfa dfa;
  49. /* The Regex compiled patterns. */
  50. static struct patterns
  51. {
  52. /* Regex compiled regexp. */
  53. struct re_pattern_buffer regexbuf;
  54. struct re_registers regs; /* This is here on account of a BRAIN-DEAD
  55. Q@#%!# library interface in regex.c. */
  56. } patterns0;
  57. struct patterns *patterns;
  58. size_t pcount;
  59. /* KWset compiled pattern. For Ecompile and Gcompile, we compile
  60. a list of strings, at least one of which is known to occur in
  61. any string matching the regexp. */
  62. static kwset_t kwset;
  63. /* Number of compiled fixed strings known to exactly match the regexp.
  64. If kwsexec returns < kwset_exact_matches, then we don't need to
  65. call the regexp matcher at all. */
  66. static int kwset_exact_matches;
  67. /* UTF-8 encoding allows some optimizations that we can't otherwise
  68. assume in a multibyte encoding. */
  69. static int using_utf8;
  70. static void kwsinit PARAMS ((void));
  71. static void kwsmusts PARAMS ((void));
  72. static void Gcompile PARAMS ((char const *, size_t));
  73. static void Ecompile PARAMS ((char const *, size_t));
  74. static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
  75. static void Fcompile PARAMS ((char const *, size_t));
  76. static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
  77. static void Pcompile PARAMS ((char const *, size_t ));
  78. static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
  79. void
  80. check_utf8 (void)
  81. {
  82. #ifdef HAVE_LANGINFO_CODESET
  83. if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
  84. using_utf8 = 1;
  85. #endif
  86. }
  87. void
  88. dfaerror (char const *mesg)
  89. {
  90. error (2, 0, mesg);
  91. }
  92. static void
  93. kwsinit (void)
  94. {
  95. static char trans[NCHAR];
  96. int i;
  97. if (match_icase)
  98. for (i = 0; i < NCHAR; ++i)
  99. trans[i] = TOLOWER (i);
  100. if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
  101. error (2, 0, _("memory exhausted"));
  102. }
  103. /* If the DFA turns out to have some set of fixed strings one of
  104. which must occur in the match, then we build a kwset matcher
  105. to find those strings, and thus quickly filter out impossible
  106. matches. */
  107. static void
  108. kwsmusts (void)
  109. {
  110. struct dfamust const *dm;
  111. char const *err;
  112. if (dfa.musts)
  113. {
  114. kwsinit ();
  115. /* First, we compile in the substrings known to be exact
  116. matches. The kwset matcher will return the index
  117. of the matching string that it chooses. */
  118. for (dm = dfa.musts; dm; dm = dm->next)
  119. {
  120. if (!dm->exact)
  121. continue;
  122. ++kwset_exact_matches;
  123. if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
  124. error (2, 0, err);
  125. }
  126. /* Now, we compile the substrings that will require
  127. the use of the regexp matcher. */
  128. for (dm = dfa.musts; dm; dm = dm->next)
  129. {
  130. if (dm->exact)
  131. continue;
  132. if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
  133. error (2, 0, err);
  134. }
  135. if ((err = kwsprep (kwset)) != 0)
  136. error (2, 0, err);
  137. }
  138. }
  139. static void
  140. Gcompile (char const *pattern, size_t size)
  141. {
  142. const char *err;
  143. char const *sep;
  144. size_t total = size;
  145. char const *motif = pattern;
  146. check_utf8 ();
  147. re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
  148. dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
  149. /* For GNU regex compiler we have to pass the patterns separately to detect
  150. errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
  151. GNU regex should have raise a syntax error. The same for backref, where
  152. the backref should have been local to each pattern. */
  153. do
  154. {
  155. size_t len;
  156. sep = memchr (motif, '\n', total);
  157. if (sep)
  158. {
  159. len = sep - motif;
  160. sep++;
  161. total -= (len + 1);
  162. }
  163. else
  164. {
  165. len = total;
  166. total = 0;
  167. }
  168. patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
  169. if (patterns == NULL)
  170. error (2, errno, _("memory exhausted"));
  171. patterns[pcount] = patterns0;
  172. if ((err = re_compile_pattern (motif, len,
  173. &(patterns[pcount].regexbuf))) != 0)
  174. error (2, 0, err);
  175. pcount++;
  176. motif = sep;
  177. } while (sep && total != 0);
  178. /* In the match_words and match_lines cases, we use a different pattern
  179. for the DFA matcher that will quickly throw out cases that won't work.
  180. Then if DFA succeeds we do some hairy stuff using the regex matcher
  181. to decide whether the match should really count. */
  182. if (match_words || match_lines)
  183. {
  184. /* In the whole-word case, we use the pattern:
  185. \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
  186. In the whole-line case, we use the pattern:
  187. ^\(userpattern\)$. */
  188. static char const line_beg[] = "^\\(";
  189. static char const line_end[] = "\\)$";
  190. static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
  191. static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
  192. char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
  193. size_t i;
  194. strcpy (n, match_lines ? line_beg : word_beg);
  195. i = strlen (n);
  196. memcpy (n + i, pattern, size);
  197. i += size;
  198. strcpy (n + i, match_lines ? line_end : word_end);
  199. i += strlen (n + i);
  200. pattern = n;
  201. size = i;
  202. }
  203. dfacomp (pattern, size, &dfa, 1);
  204. kwsmusts ();
  205. }
  206. static void
  207. Ecompile (char const *pattern, size_t size)
  208. {
  209. const char *err;
  210. const char *sep;
  211. size_t total = size;
  212. char const *motif = pattern;
  213. check_utf8 ();
  214. if (strcmp (matcher, "awk") == 0)
  215. {
  216. re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
  217. dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
  218. }
  219. else
  220. {
  221. re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
  222. dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
  223. }
  224. /* For GNU regex compiler we have to pass the patterns separately to detect
  225. errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
  226. GNU regex should have raise a syntax error. The same for backref, where
  227. the backref should have been local to each pattern. */
  228. do
  229. {
  230. size_t len;
  231. sep = memchr (motif, '\n', total);
  232. if (sep)
  233. {
  234. len = sep - motif;
  235. sep++;
  236. total -= (len + 1);
  237. }
  238. else
  239. {
  240. len = total;
  241. total = 0;
  242. }
  243. patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
  244. if (patterns == NULL)
  245. error (2, errno, _("memory exhausted"));
  246. patterns[pcount] = patterns0;
  247. if ((err = re_compile_pattern (motif, len,
  248. &(patterns[pcount].regexbuf))) != 0)
  249. error (2, 0, err);
  250. pcount++;
  251. motif = sep;
  252. } while (sep && total != 0);
  253. /* In the match_words and match_lines cases, we use a different pattern
  254. for the DFA matcher that will quickly throw out cases that won't work.
  255. Then if DFA succeeds we do some hairy stuff using the regex matcher
  256. to decide whether the match should really count. */
  257. if (match_words || match_lines)
  258. {
  259. /* In the whole-word case, we use the pattern:
  260. (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
  261. In the whole-line case, we use the pattern:
  262. ^(userpattern)$. */
  263. static char const line_beg[] = "^(";
  264. static char const line_end[] = ")$";
  265. static char const word_beg[] = "(^|[^[:alnum:]_])(";
  266. static char const word_end[] = ")([^[:alnum:]_]|$)";
  267. char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
  268. size_t i;
  269. strcpy (n, match_lines ? line_beg : word_beg);
  270. i = strlen(n);
  271. memcpy (n + i, pattern, size);
  272. i += size;
  273. strcpy (n + i, match_lines ? line_end : word_end);
  274. i += strlen (n + i);
  275. pattern = n;
  276. size = i;
  277. }
  278. dfacomp (pattern, size, &dfa, 1);
  279. kwsmusts ();
  280. }
  281. static size_t
  282. EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
  283. {
  284. register char const *buflim, *beg, *end;
  285. char eol = eolbyte;
  286. int backref, start, len;
  287. struct kwsmatch kwsm;
  288. size_t i, ret_val;
  289. static int use_dfa;
  290. static int use_dfa_checked = 0;
  291. #ifdef MBS_SUPPORT
  292. const char *last_char = NULL;
  293. int mb_cur_max = MB_CUR_MAX;
  294. mbstate_t mbs;
  295. memset (&mbs, '\0', sizeof (mbstate_t));
  296. #endif /* MBS_SUPPORT */
  297. if (!use_dfa_checked)
  298. {
  299. char *grep_use_dfa = getenv ("GREP_USE_DFA");
  300. if (!grep_use_dfa)
  301. {
  302. #ifdef MBS_SUPPORT
  303. /* Turn off DFA when processing multibyte input. */
  304. use_dfa = (MB_CUR_MAX == 1);
  305. #else
  306. use_dfa = 1;
  307. #endif /* MBS_SUPPORT */
  308. }
  309. else
  310. {
  311. use_dfa = atoi (grep_use_dfa);
  312. }
  313. use_dfa_checked = 1;
  314. }
  315. buflim = buf + size;
  316. for (beg = end = buf; end < buflim; beg = end)
  317. {
  318. if (!exact)
  319. {
  320. if (kwset)
  321. {
  322. /* Find a possible match using the KWset matcher. */
  323. #ifdef MBS_SUPPORT
  324. size_t bytes_left = 0;
  325. #endif /* MBS_SUPPORT */
  326. size_t offset;
  327. #ifdef MBS_SUPPORT
  328. /* kwsexec doesn't work with match_icase and multibyte input. */
  329. if (match_icase && mb_cur_max > 1)
  330. /* Avoid kwset */
  331. offset = 0;
  332. else
  333. #endif /* MBS_SUPPORT */
  334. offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
  335. if (offset == (size_t) -1)
  336. goto failure;
  337. #ifdef MBS_SUPPORT
  338. if (mb_cur_max > 1 && !using_utf8)
  339. {
  340. bytes_left = offset;
  341. while (bytes_left)
  342. {
  343. size_t mlen = mbrlen (beg, bytes_left, &mbs);
  344. last_char = beg;
  345. if (mlen == (size_t) -1 || mlen == 0)
  346. {
  347. /* Incomplete character: treat as single-byte. */
  348. memset (&mbs, '\0', sizeof (mbstate_t));
  349. beg++;
  350. bytes_left--;
  351. continue;
  352. }
  353. if (mlen == (size_t) -2)
  354. /* Offset points inside multibyte character:
  355. * no good. */
  356. break;
  357. beg += mlen;
  358. bytes_left -= mlen;
  359. }
  360. }
  361. else
  362. #endif /* MBS_SUPPORT */
  363. beg += offset;
  364. /* Narrow down to the line containing the candidate, and
  365. run it through DFA. */
  366. end = memchr(beg, eol, buflim - beg);
  367. end++;
  368. #ifdef MBS_SUPPORT
  369. if (mb_cur_max > 1 && bytes_left)
  370. continue;
  371. #endif /* MBS_SUPPORT */
  372. while (beg > buf && beg[-1] != eol)
  373. --beg;
  374. if (
  375. #ifdef MBS_SUPPORT
  376. !(match_icase && mb_cur_max > 1) &&
  377. #endif /* MBS_SUPPORT */
  378. (kwsm.index < kwset_exact_matches))
  379. goto success_in_beg_and_end;
  380. if (use_dfa &&
  381. dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
  382. continue;
  383. }
  384. else
  385. {
  386. /* No good fixed strings; start with DFA. */
  387. #ifdef MBS_SUPPORT
  388. size_t bytes_left = 0;
  389. #endif /* MBS_SUPPORT */
  390. size_t offset = 0;
  391. if (use_dfa)
  392. offset = dfaexec (&dfa, beg, buflim - beg, &backref);
  393. if (offset == (size_t) -1)
  394. break;
  395. /* Narrow down to the line we've found. */
  396. #ifdef MBS_SUPPORT
  397. if (mb_cur_max > 1 && !using_utf8)
  398. {
  399. bytes_left = offset;
  400. while (bytes_left)
  401. {
  402. size_t mlen = mbrlen (beg, bytes_left, &mbs);
  403. last_char = beg;
  404. if (mlen == (size_t) -1 || mlen == 0)
  405. {
  406. /* Incomplete character: treat as single-byte. */
  407. memset (&mbs, '\0', sizeof (mbstate_t));
  408. beg++;
  409. bytes_left--;
  410. continue;
  411. }
  412. if (mlen == (size_t) -2)
  413. /* Offset points inside multibyte character:
  414. * no good. */
  415. break;
  416. beg += mlen;
  417. bytes_left -= mlen;
  418. }
  419. }
  420. else
  421. #endif /* MBS_SUPPORT */
  422. beg += offset;
  423. end = memchr (beg, eol, buflim - beg);
  424. end++;
  425. #ifdef MBS_SUPPORT
  426. if (mb_cur_max > 1 && bytes_left)
  427. continue;
  428. #endif /* MBS_SUPPORT */
  429. while (beg > buf && beg[-1] != eol)
  430. --beg;
  431. }
  432. /* Successful, no backreferences encountered! */
  433. if (use_dfa && !backref)
  434. goto success_in_beg_and_end;
  435. }
  436. else
  437. end = beg + size;
  438. /* If we've made it to this point, this means DFA has seen
  439. a probable match, and we need to run it through Regex. */
  440. for (i = 0; i < pcount; i++)
  441. {
  442. patterns[i].regexbuf.not_eol = 0;
  443. if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
  444. end - beg - 1, 0,
  445. end - beg - 1, &(patterns[i].regs))))
  446. {
  447. len = patterns[i].regs.end[0] - start;
  448. if (exact && !match_words)
  449. goto success_in_start_and_len;
  450. if ((!match_lines && !match_words)
  451. || (match_lines && len == end - beg - 1))
  452. goto success_in_beg_and_end;
  453. /* If -w, check if the match aligns with word boundaries.
  454. We do this iteratively because:
  455. (a) the line may contain more than one occurence of the
  456. pattern, and
  457. (b) Several alternatives in the pattern might be valid at a
  458. given point, and we may need to consider a shorter one to
  459. find a word boundary. */
  460. if (match_words)
  461. while (start >= 0)
  462. {
  463. int lword_match = 0;
  464. if (start == 0)
  465. lword_match = 1;
  466. else
  467. {
  468. assert (start > 0);
  469. #ifdef MBS_SUPPORT
  470. if (mb_cur_max > 1)
  471. {
  472. const char *s;
  473. size_t mr;
  474. wchar_t pwc;
  475. /* Locate the start of the multibyte character
  476. before the match position (== beg + start). */
  477. if (using_utf8)
  478. {
  479. /* UTF-8 is a special case: scan backwards
  480. until we find a 7-bit character or a
  481. lead byte. */
  482. s = beg + start - 1;
  483. while (s > buf
  484. && (unsigned char) *s >= 0x80
  485. && (unsigned char) *s <= 0xbf)
  486. --s;
  487. }
  488. else
  489. {
  490. /* Scan forwards to find the start of the
  491. last complete character before the
  492. match position. */
  493. size_t bytes_left = start - 1;
  494. s = beg;
  495. while (bytes_left > 0)
  496. {
  497. mr = mbrlen (s, bytes_left, &mbs);
  498. if (mr == (size_t) -1 || mr == 0)
  499. {
  500. memset (&mbs, '\0', sizeof (mbs));
  501. s++;
  502. bytes_left--;
  503. continue;
  504. }
  505. if (mr == (size_t) -2)
  506. {
  507. memset (&mbs, '\0', sizeof (mbs));
  508. break;
  509. }
  510. s += mr;
  511. bytes_left -= mr;
  512. }
  513. }
  514. mr = mbrtowc (&pwc, s, beg + start - s, &mbs);
  515. if (mr == (size_t) -2 || mr == (size_t) -1 ||
  516. mr == 0)
  517. {
  518. memset (&mbs, '\0', sizeof (mbstate_t));
  519. lword_match = 1;
  520. }
  521. else if (!(iswalnum (pwc) || pwc == L'_')
  522. && mr == beg + start - s)
  523. lword_match = 1;
  524. }
  525. else
  526. #endif /* MBS_SUPPORT */
  527. if (!WCHAR ((unsigned char) beg[start - 1]))
  528. lword_match = 1;
  529. }
  530. if (lword_match)
  531. {
  532. int rword_match = 0;
  533. if (start + len == end - beg - 1)
  534. rword_match = 1;
  535. else
  536. {
  537. #ifdef MBS_SUPPORT
  538. if (mb_cur_max > 1)
  539. {
  540. wchar_t nwc;
  541. int mr;
  542. mr = mbtowc (&nwc, beg + start + len,
  543. end - beg - start - len - 1);
  544. if (mr <= 0)
  545. {
  546. memset (&mbs, '\0', sizeof (mbstate_t));
  547. rword_match = 1;
  548. }
  549. else if (!iswalnum (nwc) && nwc != L'_')
  550. rword_match = 1;
  551. }
  552. else
  553. #endif /* MBS_SUPPORT */
  554. if (!WCHAR ((unsigned char) beg[start + len]))
  555. rword_match = 1;
  556. }
  557. if (rword_match)
  558. {
  559. if (!exact)
  560. /* Returns the whole line. */
  561. goto success_in_beg_and_end;
  562. else
  563. /* Returns just this word match. */
  564. goto success_in_start_and_len;
  565. }
  566. }
  567. if (len > 0)
  568. {
  569. /* Try a shorter length anchored at the same place. */
  570. --len;
  571. patterns[i].regexbuf.not_eol = 1;
  572. len = re_match (&(patterns[i].regexbuf), beg,
  573. start + len, start,
  574. &(patterns[i].regs));
  575. }
  576. if (len <= 0)
  577. {
  578. /* Try looking further on. */
  579. if (start == end - beg - 1)
  580. break;
  581. ++start;
  582. patterns[i].regexbuf.not_eol = 0;
  583. start = re_search (&(patterns[i].regexbuf), beg,
  584. end - beg - 1,
  585. start, end - beg - 1 - start,
  586. &(patterns[i].regs));
  587. len = patterns[i].regs.end[0] - start;
  588. }
  589. }
  590. }
  591. } /* for Regex patterns. */
  592. } /* for (beg = end ..) */
  593. failure:
  594. return (size_t) -1;
  595. success_in_beg_and_end:
  596. len = end - beg;
  597. start = beg - buf;
  598. /* FALLTHROUGH */
  599. success_in_start_and_len:
  600. *match_size = len;
  601. return start;
  602. }
  603. #ifdef MBS_SUPPORT
  604. static int f_i_multibyte; /* whether we're using the new -Fi MB method */
  605. static struct
  606. {
  607. wchar_t **patterns;
  608. size_t count, maxlen;
  609. unsigned char *match;
  610. } Fimb;
  611. #endif
  612. static void
  613. Fcompile (char const *pattern, size_t size)
  614. {
  615. int mb_cur_max = MB_CUR_MAX;
  616. char const *beg, *lim, *err;
  617. check_utf8 ();
  618. #ifdef MBS_SUPPORT
  619. /* Support -F -i for UTF-8 input. */
  620. if (match_icase && mb_cur_max > 1)
  621. {
  622. mbstate_t mbs;
  623. wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
  624. const char *patternend = pattern;
  625. size_t wcsize;
  626. kwset_t fimb_kwset = NULL;
  627. char *starts = NULL;
  628. wchar_t *wcbeg, *wclim;
  629. size_t allocated = 0;
  630. memset (&mbs, '\0', sizeof (mbs));
  631. # ifdef __GNU_LIBRARY__
  632. wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
  633. if (patternend != pattern + size)
  634. wcsize = (size_t) -1;
  635. # else
  636. {
  637. char *patterncopy = xmalloc (size + 1);
  638. memcpy (patterncopy, pattern, size);
  639. patterncopy[size] = '\0';
  640. patternend = patterncopy;
  641. wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
  642. if (patternend != patterncopy + size)
  643. wcsize = (size_t) -1;
  644. free (patterncopy);
  645. }
  646. # endif
  647. if (wcsize + 2 <= 2)
  648. {
  649. fimb_fail:
  650. free (wcpattern);
  651. free (starts);
  652. if (fimb_kwset)
  653. kwsfree (fimb_kwset);
  654. free (Fimb.patterns);
  655. Fimb.patterns = NULL;
  656. }
  657. else
  658. {
  659. if (!(fimb_kwset = kwsalloc (NULL)))
  660. error (2, 0, _("memory exhausted"));
  661. starts = xmalloc (mb_cur_max * 3);
  662. wcbeg = wcpattern;
  663. do
  664. {
  665. int i;
  666. size_t wclen;
  667. if (Fimb.count >= allocated)
  668. {
  669. if (allocated == 0)
  670. allocated = 128;
  671. else
  672. allocated *= 2;
  673. Fimb.patterns = xrealloc (Fimb.patterns,
  674. sizeof (wchar_t *) * allocated);
  675. }
  676. Fimb.patterns[Fimb.count++] = wcbeg;
  677. for (wclim = wcbeg;
  678. wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
  679. *wclim = towlower (*wclim);
  680. *wclim = L'\0';
  681. wclen = wclim - wcbeg;
  682. if (wclen > Fimb.maxlen)
  683. Fimb.maxlen = wclen;
  684. if (wclen > 3)
  685. wclen = 3;
  686. if (wclen == 0)
  687. {
  688. if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
  689. error (2, 0, err);
  690. }
  691. else
  692. for (i = 0; i < (1 << wclen); i++)
  693. {
  694. char *p = starts;
  695. int j, k;
  696. for (j = 0; j < wclen; ++j)
  697. {
  698. wchar_t wc = wcbeg[j];
  699. if (i & (1 << j))
  700. {
  701. wc = towupper (wc);
  702. if (wc == wcbeg[j])
  703. continue;
  704. }
  705. k = wctomb (p, wc);
  706. if (k <= 0)
  707. goto fimb_fail;
  708. p += k;
  709. }
  710. if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
  711. error (2, 0, err);
  712. }
  713. if (wclim < wcpattern + wcsize)
  714. ++wclim;
  715. wcbeg = wclim;
  716. }
  717. while (wcbeg < wcpattern + wcsize);
  718. f_i_multibyte = 1;
  719. kwset = fimb_kwset;
  720. free (starts);
  721. Fimb.match = xmalloc (Fimb.count);
  722. if ((err = kwsprep (kwset)) != 0)
  723. error (2, 0, err);
  724. return;
  725. }
  726. }
  727. #endif /* MBS_SUPPORT */
  728. kwsinit ();
  729. beg = pattern;
  730. do
  731. {
  732. for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
  733. ;
  734. if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
  735. error (2, 0, err);
  736. if (lim < pattern + size)
  737. ++lim;
  738. beg = lim;
  739. }
  740. while (beg < pattern + size);
  741. if ((err = kwsprep (kwset)) != 0)
  742. error (2, 0, err);
  743. }
  744. #ifdef MBS_SUPPORT
  745. static int
  746. Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
  747. {
  748. size_t len, letter, i;
  749. int ret = -1;
  750. mbstate_t mbs;
  751. wchar_t wc;
  752. int patterns_left;
  753. assert (match_icase && f_i_multibyte == 1);
  754. assert (MB_CUR_MAX > 1);
  755. memset (&mbs, '\0', sizeof (mbs));
  756. memset (Fimb.match, '\1', Fimb.count);
  757. letter = len = 0;
  758. patterns_left = 1;
  759. while (patterns_left && len <= size)
  760. {
  761. size_t c;
  762. patterns_left = 0;
  763. if (len < size)
  764. {
  765. c = mbrtowc (&wc, buf + len, size - len, &mbs);
  766. if (c + 2 <= 2)
  767. return ret;
  768. wc = towlower (wc);
  769. }
  770. else
  771. {
  772. c = 1;
  773. wc = L'\0';
  774. }
  775. for (i = 0; i < Fimb.count; i++)
  776. {
  777. if (Fimb.match[i])
  778. {
  779. if (Fimb.patterns[i][letter] == L'\0')
  780. {
  781. /* Found a match. */
  782. *plen = len;
  783. if (!exact && !match_words)
  784. return 0;
  785. else
  786. {
  787. /* For -w or exact look for longest match. */
  788. ret = 0;
  789. Fimb.match[i] = '\0';
  790. continue;
  791. }
  792. }
  793. if (Fimb.patterns[i][letter] == wc)
  794. patterns_left = 1;
  795. else
  796. Fimb.match[i] = '\0';
  797. }
  798. }
  799. len += c;
  800. letter++;
  801. }
  802. return ret;
  803. }
  804. #endif /* MBS_SUPPORT */
  805. static size_t
  806. Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
  807. {
  808. register char const *beg, *try, *end;
  809. register size_t len;
  810. char eol = eolbyte;
  811. struct kwsmatch kwsmatch;
  812. size_t ret_val;
  813. #ifdef MBS_SUPPORT
  814. int mb_cur_max = MB_CUR_MAX;
  815. mbstate_t mbs;
  816. memset (&mbs, '\0', sizeof (mbstate_t));
  817. const char *last_char = NULL;
  818. #endif /* MBS_SUPPORT */
  819. for (beg = buf; beg <= buf + size; ++beg)
  820. {
  821. size_t offset;
  822. offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
  823. if (offset == (size_t) -1)
  824. goto failure;
  825. #ifdef MBS_SUPPORT
  826. if (mb_cur_max > 1 && !using_utf8)
  827. {
  828. size_t bytes_left = offset;
  829. while (bytes_left)
  830. {
  831. size_t mlen = mbrlen (beg, bytes_left, &mbs);
  832. last_char = beg;
  833. if (mlen == (size_t) -1 || mlen == 0)
  834. {
  835. /* Incomplete character: treat as single-byte. */
  836. memset (&mbs, '\0', sizeof (mbstate_t));
  837. beg++;
  838. bytes_left--;
  839. continue;
  840. }
  841. if (mlen == (size_t) -2)
  842. /* Offset points inside multibyte character: no good. */
  843. break;
  844. beg += mlen;
  845. bytes_left -= mlen;
  846. }
  847. if (bytes_left)
  848. continue;
  849. }
  850. else
  851. #endif /* MBS_SUPPORT */
  852. beg += offset;
  853. #ifdef MBS_SUPPORT
  854. /* For f_i_multibyte, the string at beg now matches first 3 chars of
  855. one of the search strings (less if there are shorter search strings).
  856. See if this is a real match. */
  857. if (f_i_multibyte
  858. && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
  859. goto next_char;
  860. #endif /* MBS_SUPPORT */
  861. len = kwsmatch.size[0];
  862. if (exact && !match_words)
  863. goto success_in_beg_and_len;
  864. if (match_lines)
  865. {
  866. if (beg > buf && beg[-1] != eol)
  867. goto next_char;
  868. if (beg + len < buf + size && beg[len] != eol)
  869. goto next_char;
  870. goto success;
  871. }
  872. else if (match_words)
  873. {
  874. while (1)
  875. {
  876. int word_match = 0;
  877. if (beg > buf)
  878. {
  879. #ifdef MBS_SUPPORT
  880. if (mb_cur_max > 1)
  881. {
  882. const char *s;
  883. int mr;
  884. wchar_t pwc;
  885. if (using_utf8)
  886. {
  887. s = beg - 1;
  888. while (s > buf
  889. && (unsigned char) *s >= 0x80
  890. && (unsigned char) *s <= 0xbf)
  891. --s;
  892. }
  893. else
  894. s = last_char;
  895. mr = mbtowc (&pwc, s, beg - s);
  896. if (mr <= 0)
  897. memset (&mbs, '\0', sizeof (mbstate_t));
  898. else if ((iswalnum (pwc) || pwc == L'_')
  899. && mr == (int) (beg - s))
  900. goto next_char;
  901. }
  902. else
  903. #endif /* MBS_SUPPORT */
  904. if (WCHAR ((unsigned char) beg[-1]))
  905. goto next_char;
  906. }
  907. #ifdef MBS_SUPPORT
  908. if (mb_cur_max > 1)
  909. {
  910. wchar_t nwc;
  911. int mr;
  912. mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
  913. if (mr <= 0)
  914. {
  915. memset (&mbs, '\0', sizeof (mbstate_t));
  916. word_match = 1;
  917. }
  918. else if (!iswalnum (nwc) && nwc != L'_')
  919. word_match = 1;
  920. }
  921. else
  922. #endif /* MBS_SUPPORT */
  923. if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
  924. word_match = 1;
  925. if (word_match)
  926. {
  927. if (!exact)
  928. /* Returns the whole line now we know there's a word match. */
  929. goto success;
  930. else
  931. /* Returns just this word match. */
  932. goto success_in_beg_and_len;
  933. }
  934. if (len > 0)
  935. {
  936. /* Try a shorter length anchored at the same place. */
  937. --len;
  938. offset = kwsexec (kwset, beg, len, &kwsmatch);
  939. if (offset == -1)
  940. goto next_char; /* Try a different anchor. */
  941. #ifdef MBS_SUPPORT
  942. if (mb_cur_max > 1 && !using_utf8)
  943. {
  944. size_t bytes_left = offset;
  945. while (bytes_left)
  946. {
  947. size_t mlen = mbrlen (beg, bytes_left, &mbs);
  948. last_char = beg;
  949. if (mlen == (size_t) -1 || mlen == 0)
  950. {
  951. /* Incomplete character: treat as single-byte. */
  952. memset (&mbs, '\0', sizeof (mbstate_t));
  953. beg++;
  954. bytes_left--;
  955. continue;
  956. }
  957. if (mlen == (size_t) -2)
  958. {
  959. /* Offset points inside multibyte character:
  960. * no good. */
  961. break;
  962. }
  963. beg += mlen;
  964. bytes_left -= mlen;
  965. }
  966. if (bytes_left)
  967. {
  968. memset (&mbs, '\0', sizeof (mbstate_t));
  969. goto next_char; /* Try a different anchor. */
  970. }
  971. }
  972. else
  973. #endif /* MBS_SUPPORT */
  974. beg += offset;
  975. #ifdef MBS_SUPPORT
  976. /* The string at beg now matches first 3 chars of one of
  977. the search strings (less if there are shorter search
  978. strings). See if this is a real match. */
  979. if (f_i_multibyte
  980. && Fimbexec (beg, len - offset, &kwsmatch.size[0],
  981. exact))
  982. goto next_char;
  983. #endif /* MBS_SUPPORT */
  984. len = kwsmatch.size[0];
  985. }
  986. }
  987. }
  988. else
  989. goto success;
  990. next_char:;
  991. #ifdef MBS_SUPPORT
  992. /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
  993. by ++beg above. */
  994. if (mb_cur_max > 1)
  995. {
  996. if (using_utf8)
  997. {
  998. unsigned char c = *beg;
  999. if (c >= 0xc2)
  1000. {
  1001. if (c < 0xe0)
  1002. ++beg;
  1003. else if (c < 0xf0)
  1004. beg += 2;
  1005. else if (c < 0xf8)
  1006. beg += 3;
  1007. else if (c < 0xfc)
  1008. beg += 4;
  1009. else if (c < 0xfe)
  1010. beg += 5;
  1011. }
  1012. }
  1013. else
  1014. {
  1015. size_t l = mbrlen (beg, buf + size - beg, &mbs);
  1016. last_char = beg;
  1017. if (l + 2 >= 2)
  1018. beg += l - 1;
  1019. else
  1020. memset (&mbs, '\0', sizeof (mbstate_t));
  1021. }
  1022. }
  1023. #endif /* MBS_SUPPORT */
  1024. }
  1025. failure:
  1026. return -1;
  1027. success:
  1028. #ifdef MBS_SUPPORT
  1029. if (mb_cur_max > 1 && !using_utf8)
  1030. {
  1031. end = beg + len;
  1032. while (end < buf + size)
  1033. {
  1034. size_t mlen = mbrlen (end, buf + size - end, &mbs);
  1035. if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
  1036. {
  1037. memset (&mbs, '\0', sizeof (mbstate_t));
  1038. mlen = 1;
  1039. }
  1040. if (mlen == 1 && *end == eol)
  1041. break;
  1042. end += mlen;
  1043. }
  1044. }
  1045. else
  1046. #endif /* MBS_SUPPORT */
  1047. end = memchr (beg + len, eol, (buf + size) - (beg + len));
  1048. end++;
  1049. while (buf < beg && beg[-1] != eol)
  1050. --beg;
  1051. len = end - beg;
  1052. /* FALLTHROUGH */
  1053. success_in_beg_and_len:
  1054. *match_size = len;
  1055. return beg - buf;
  1056. }
  1057. #if HAVE_LIBPCRE
  1058. /* Compiled internal form of a Perl regular expression. */
  1059. static pcre *cre;
  1060. /* Additional information about the pattern. */
  1061. static pcre_extra *extra;
  1062. #endif
  1063. static void
  1064. Pcompile (char const *pattern, size_t size)
  1065. {
  1066. #if !HAVE_LIBPCRE
  1067. error (2, 0, _("The -P option is not supported"));
  1068. #else
  1069. int e;
  1070. char const *ep;
  1071. char *re = xmalloc (4 * size + 7);
  1072. int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
  1073. char const *patlim = pattern + size;
  1074. char *n = re;
  1075. char const *p;
  1076. char const *pnul;
  1077. /* FIXME: Remove this restriction. */
  1078. if (eolbyte != '\n')
  1079. error (2, 0, _("The -P and -z options cannot be combined"));
  1080. *n = '\0';
  1081. if (match_lines)
  1082. strcpy (n, "^(");
  1083. if (match_words)
  1084. strcpy (n, "\\b(");
  1085. n += strlen (n);
  1086. /* The PCRE interface doesn't allow NUL bytes in the pattern, so
  1087. replace each NUL byte in the pattern with the four characters
  1088. "\000", removing a preceding backslash if there are an odd
  1089. number of backslashes before the NUL.
  1090. FIXME: This method does not work with some multibyte character
  1091. encodings, notably Shift-JIS, where a multibyte character can end
  1092. in a backslash byte. */
  1093. for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
  1094. {
  1095. memcpy (n, p, pnul - p);
  1096. n += pnul - p;
  1097. for (p = pnul; pattern < p && p[-1] == '\\'; p--)
  1098. continue;
  1099. n -= (pnul - p) & 1;
  1100. strcpy (n, "\\000");
  1101. n += 4;
  1102. }
  1103. memcpy (n, p, patlim - p);
  1104. n += patlim - p;
  1105. *n = '\0';
  1106. if (match_words)
  1107. strcpy (n, ")\\b");
  1108. if (match_lines)
  1109. strcpy (n, ")$");
  1110. cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
  1111. if (!cre)
  1112. error (2, 0, ep);
  1113. extra = pcre_study (cre, 0, &ep);
  1114. if (ep)
  1115. error (2, 0, ep);
  1116. free (re);
  1117. #endif
  1118. }
  1119. static size_t
  1120. Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
  1121. {
  1122. #if !HAVE_LIBPCRE
  1123. abort ();
  1124. return -1;
  1125. #else
  1126. /* This array must have at least two elements; everything after that
  1127. is just for performance improvement in pcre_exec. */
  1128. int sub[300];
  1129. int e = pcre_exec (cre, extra, buf, size, 0, 0,
  1130. sub, sizeof sub / sizeof *sub);
  1131. if (e <= 0)
  1132. {
  1133. switch (e)
  1134. {
  1135. case PCRE_ERROR_NOMATCH:
  1136. return -1;
  1137. case PCRE_ERROR_NOMEMORY:
  1138. error (2, 0, _("Memory exhausted"));
  1139. default:
  1140. abort ();
  1141. }
  1142. }
  1143. else
  1144. {
  1145. /* Narrow down to the line we've found. */
  1146. char const *beg = buf + sub[0];
  1147. char const *end = buf + sub[1];
  1148. char const *buflim = buf + size;
  1149. char eol = eolbyte;
  1150. if (!exact)
  1151. {
  1152. end = memchr (end, eol, buflim - end);
  1153. end++;
  1154. while (buf < beg && beg[-1] != eol)
  1155. --beg;
  1156. }
  1157. *match_size = end - beg;
  1158. return beg - buf;
  1159. }
  1160. #endif
  1161. }
  1162. struct matcher const matchers[] = {
  1163. { "default", Gcompile, EGexecute },
  1164. { "grep", Gcompile, EGexecute },
  1165. { "egrep", Ecompile, EGexecute },
  1166. { "awk", Ecompile, EGexecute },
  1167. { "fgrep", Fcompile, Fexecute },
  1168. { "perl", Pcompile, Pexecute },
  1169. { "", 0, 0 },
  1170. };