PageRenderTime 38ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 1ms

/grep-src/lib/mbsstr.c

https://gitlab.com/c9-mirror/node-gnu-tools
C | 382 lines | 225 code | 30 blank | 127 comment | 58 complexity | ab6b377a7af04d8e9c9ee6aafc328788 MD5 | raw file
  1. /* Searching in a string.
  2. Copyright (C) 2005-2012 Free Software Foundation, Inc.
  3. Written by Bruno Haible <bruno@clisp.org>, 2005.
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include <string.h>
  17. #include <stdbool.h>
  18. #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */
  19. #include "malloca.h"
  20. #include "mbuiter.h"
  21. /* Knuth-Morris-Pratt algorithm. */
  22. #define UNIT unsigned char
  23. #define CANON_ELEMENT(c) c
  24. #include "str-kmp.h"
  25. /* Knuth-Morris-Pratt algorithm.
  26. See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
  27. Return a boolean indicating success:
  28. Return true and set *RESULTP if the search was completed.
  29. Return false if it was aborted because not enough memory was available. */
  30. static bool
  31. knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
  32. const char **resultp)
  33. {
  34. size_t m = mbslen (needle);
  35. mbchar_t *needle_mbchars;
  36. size_t *table;
  37. /* Allocate room for needle_mbchars and the table. */
  38. char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
  39. if (memory == NULL)
  40. return false;
  41. needle_mbchars = (mbchar_t *) memory;
  42. table = (size_t *) (memory + m * sizeof (mbchar_t));
  43. /* Fill needle_mbchars. */
  44. {
  45. mbui_iterator_t iter;
  46. size_t j;
  47. j = 0;
  48. for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
  49. mb_copy (&needle_mbchars[j], &mbui_cur (iter));
  50. }
  51. /* Fill the table.
  52. For 0 < i < m:
  53. 0 < table[i] <= i is defined such that
  54. forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
  55. and table[i] is as large as possible with this property.
  56. This implies:
  57. 1) For 0 < i < m:
  58. If table[i] < i,
  59. needle[table[i]..i-1] = needle[0..i-1-table[i]].
  60. 2) For 0 < i < m:
  61. rhaystack[0..i-1] == needle[0..i-1]
  62. and exists h, i <= h < m: rhaystack[h] != needle[h]
  63. implies
  64. forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
  65. table[0] remains uninitialized. */
  66. {
  67. size_t i, j;
  68. /* i = 1: Nothing to verify for x = 0. */
  69. table[1] = 1;
  70. j = 0;
  71. for (i = 2; i < m; i++)
  72. {
  73. /* Here: j = i-1 - table[i-1].
  74. The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
  75. for x < table[i-1], by induction.
  76. Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
  77. mbchar_t *b = &needle_mbchars[i - 1];
  78. for (;;)
  79. {
  80. /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
  81. is known to hold for x < i-1-j.
  82. Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
  83. if (mb_equal (*b, needle_mbchars[j]))
  84. {
  85. /* Set table[i] := i-1-j. */
  86. table[i] = i - ++j;
  87. break;
  88. }
  89. /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
  90. for x = i-1-j, because
  91. needle[i-1] != needle[j] = needle[i-1-x]. */
  92. if (j == 0)
  93. {
  94. /* The inequality holds for all possible x. */
  95. table[i] = i;
  96. break;
  97. }
  98. /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
  99. for i-1-j < x < i-1-j+table[j], because for these x:
  100. needle[x..i-2]
  101. = needle[x-(i-1-j)..j-1]
  102. != needle[0..j-1-(x-(i-1-j))] (by definition of table[j])
  103. = needle[0..i-2-x],
  104. hence needle[x..i-1] != needle[0..i-1-x].
  105. Furthermore
  106. needle[i-1-j+table[j]..i-2]
  107. = needle[table[j]..j-1]
  108. = needle[0..j-1-table[j]] (by definition of table[j]). */
  109. j = j - table[j];
  110. }
  111. /* Here: j = i - table[i]. */
  112. }
  113. }
  114. /* Search, using the table to accelerate the processing. */
  115. {
  116. size_t j;
  117. mbui_iterator_t rhaystack;
  118. mbui_iterator_t phaystack;
  119. *resultp = NULL;
  120. j = 0;
  121. mbui_init (rhaystack, haystack);
  122. mbui_init (phaystack, haystack);
  123. /* Invariant: phaystack = rhaystack + j. */
  124. while (mbui_avail (phaystack))
  125. if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
  126. {
  127. j++;
  128. mbui_advance (phaystack);
  129. if (j == m)
  130. {
  131. /* The entire needle has been found. */
  132. *resultp = mbui_cur_ptr (rhaystack);
  133. break;
  134. }
  135. }
  136. else if (j > 0)
  137. {
  138. /* Found a match of needle[0..j-1], mismatch at needle[j]. */
  139. size_t count = table[j];
  140. j -= count;
  141. for (; count > 0; count--)
  142. {
  143. if (!mbui_avail (rhaystack))
  144. abort ();
  145. mbui_advance (rhaystack);
  146. }
  147. }
  148. else
  149. {
  150. /* Found a mismatch at needle[0] already. */
  151. if (!mbui_avail (rhaystack))
  152. abort ();
  153. mbui_advance (rhaystack);
  154. mbui_advance (phaystack);
  155. }
  156. }
  157. freea (memory);
  158. return true;
  159. }
  160. /* Find the first occurrence of the character string NEEDLE in the character
  161. string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */
  162. char *
  163. mbsstr (const char *haystack, const char *needle)
  164. {
  165. /* Be careful not to look at the entire extent of haystack or needle
  166. until needed. This is useful because of these two cases:
  167. - haystack may be very long, and a match of needle found early,
  168. - needle may be very long, and not even a short initial segment of
  169. needle may be found in haystack. */
  170. if (MB_CUR_MAX > 1)
  171. {
  172. mbui_iterator_t iter_needle;
  173. mbui_init (iter_needle, needle);
  174. if (mbui_avail (iter_needle))
  175. {
  176. /* Minimizing the worst-case complexity:
  177. Let n = mbslen(haystack), m = mbslen(needle).
  178. The naïve algorithm is O(n*m) worst-case.
  179. The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
  180. memory allocation.
  181. To achieve linear complexity and yet amortize the cost of the
  182. memory allocation, we activate the Knuth-Morris-Pratt algorithm
  183. only once the naïve algorithm has already run for some time; more
  184. precisely, when
  185. - the outer loop count is >= 10,
  186. - the average number of comparisons per outer loop is >= 5,
  187. - the total number of comparisons is >= m.
  188. But we try it only once. If the memory allocation attempt failed,
  189. we don't retry it. */
  190. bool try_kmp = true;
  191. size_t outer_loop_count = 0;
  192. size_t comparison_count = 0;
  193. size_t last_ccount = 0; /* last comparison count */
  194. mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
  195. mbui_iterator_t iter_haystack;
  196. mbui_init (iter_needle_last_ccount, needle);
  197. mbui_init (iter_haystack, haystack);
  198. for (;; mbui_advance (iter_haystack))
  199. {
  200. if (!mbui_avail (iter_haystack))
  201. /* No match. */
  202. return NULL;
  203. /* See whether it's advisable to use an asymptotically faster
  204. algorithm. */
  205. if (try_kmp
  206. && outer_loop_count >= 10
  207. && comparison_count >= 5 * outer_loop_count)
  208. {
  209. /* See if needle + comparison_count now reaches the end of
  210. needle. */
  211. size_t count = comparison_count - last_ccount;
  212. for (;
  213. count > 0 && mbui_avail (iter_needle_last_ccount);
  214. count--)
  215. mbui_advance (iter_needle_last_ccount);
  216. last_ccount = comparison_count;
  217. if (!mbui_avail (iter_needle_last_ccount))
  218. {
  219. /* Try the Knuth-Morris-Pratt algorithm. */
  220. const char *result;
  221. bool success =
  222. knuth_morris_pratt_multibyte (haystack, needle,
  223. &result);
  224. if (success)
  225. return (char *) result;
  226. try_kmp = false;
  227. }
  228. }
  229. outer_loop_count++;
  230. comparison_count++;
  231. if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
  232. /* The first character matches. */
  233. {
  234. mbui_iterator_t rhaystack;
  235. mbui_iterator_t rneedle;
  236. memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
  237. mbui_advance (rhaystack);
  238. mbui_init (rneedle, needle);
  239. if (!mbui_avail (rneedle))
  240. abort ();
  241. mbui_advance (rneedle);
  242. for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
  243. {
  244. if (!mbui_avail (rneedle))
  245. /* Found a match. */
  246. return (char *) mbui_cur_ptr (iter_haystack);
  247. if (!mbui_avail (rhaystack))
  248. /* No match. */
  249. return NULL;
  250. comparison_count++;
  251. if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
  252. /* Nothing in this round. */
  253. break;
  254. }
  255. }
  256. }
  257. }
  258. else
  259. return (char *) haystack;
  260. }
  261. else
  262. {
  263. if (*needle != '\0')
  264. {
  265. /* Minimizing the worst-case complexity:
  266. Let n = strlen(haystack), m = strlen(needle).
  267. The naïve algorithm is O(n*m) worst-case.
  268. The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
  269. memory allocation.
  270. To achieve linear complexity and yet amortize the cost of the
  271. memory allocation, we activate the Knuth-Morris-Pratt algorithm
  272. only once the naïve algorithm has already run for some time; more
  273. precisely, when
  274. - the outer loop count is >= 10,
  275. - the average number of comparisons per outer loop is >= 5,
  276. - the total number of comparisons is >= m.
  277. But we try it only once. If the memory allocation attempt failed,
  278. we don't retry it. */
  279. bool try_kmp = true;
  280. size_t outer_loop_count = 0;
  281. size_t comparison_count = 0;
  282. size_t last_ccount = 0; /* last comparison count */
  283. const char *needle_last_ccount = needle; /* = needle + last_ccount */
  284. /* Speed up the following searches of needle by caching its first
  285. character. */
  286. char b = *needle++;
  287. for (;; haystack++)
  288. {
  289. if (*haystack == '\0')
  290. /* No match. */
  291. return NULL;
  292. /* See whether it's advisable to use an asymptotically faster
  293. algorithm. */
  294. if (try_kmp
  295. && outer_loop_count >= 10
  296. && comparison_count >= 5 * outer_loop_count)
  297. {
  298. /* See if needle + comparison_count now reaches the end of
  299. needle. */
  300. if (needle_last_ccount != NULL)
  301. {
  302. needle_last_ccount +=
  303. strnlen (needle_last_ccount,
  304. comparison_count - last_ccount);
  305. if (*needle_last_ccount == '\0')
  306. needle_last_ccount = NULL;
  307. last_ccount = comparison_count;
  308. }
  309. if (needle_last_ccount == NULL)
  310. {
  311. /* Try the Knuth-Morris-Pratt algorithm. */
  312. const unsigned char *result;
  313. bool success =
  314. knuth_morris_pratt ((const unsigned char *) haystack,
  315. (const unsigned char *) (needle - 1),
  316. strlen (needle - 1),
  317. &result);
  318. if (success)
  319. return (char *) result;
  320. try_kmp = false;
  321. }
  322. }
  323. outer_loop_count++;
  324. comparison_count++;
  325. if (*haystack == b)
  326. /* The first character matches. */
  327. {
  328. const char *rhaystack = haystack + 1;
  329. const char *rneedle = needle;
  330. for (;; rhaystack++, rneedle++)
  331. {
  332. if (*rneedle == '\0')
  333. /* Found a match. */
  334. return (char *) haystack;
  335. if (*rhaystack == '\0')
  336. /* No match. */
  337. return NULL;
  338. comparison_count++;
  339. if (*rhaystack != *rneedle)
  340. /* Nothing in this round. */
  341. break;
  342. }
  343. }
  344. }
  345. }
  346. else
  347. return (char *) haystack;
  348. }
  349. }