PageRenderTime 25ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/posix/bug-regex20.c

https://gitlab.com/Namal/glibc
C | 286 lines | 243 code | 16 blank | 27 comment | 18 complexity | 4e31f7a60674c5fb3693e614da947375 MD5 | raw file
  1. /* Test for UTF-8 regular expression optimizations.
  2. Copyright (C) 2003-2016 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
  5. The GNU C Library is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU Lesser General Public
  7. License as published by the Free Software Foundation; either
  8. version 2.1 of the License, or (at your option) any later version.
  9. The GNU C Library is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. Lesser General Public License for more details.
  13. You should have received a copy of the GNU Lesser General Public
  14. License along with the GNU C Library; if not, see
  15. <http://www.gnu.org/licenses/>. */
  16. #include <sys/types.h>
  17. #include <mcheck.h>
  18. #include <regex.h>
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <string.h>
  22. #include <locale.h>
  23. #define RE_NO_INTERNAL_PROTOTYPES 1
  24. #include "regex_internal.h"
  25. #define BRE RE_SYNTAX_POSIX_BASIC
  26. #define ERE RE_SYNTAX_POSIX_EXTENDED
  27. static struct
  28. {
  29. int syntax;
  30. const char *pattern;
  31. const char *string;
  32. int res, optimize;
  33. } tests[] = {
  34. /* \xc3\x84 LATIN CAPITAL LETTER A WITH DIAERESIS
  35. \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS
  36. \xc3\xa4 LATIN SMALL LETTER A WITH DIAERESIS
  37. \xc3\xb6 LATIN SMALL LETTER O WITH DIAERESIS
  38. \xe2\x80\x94 EM DASH */
  39. /* Should be optimized. */
  40. {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
  41. {BRE, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  42. {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  43. {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1},
  44. {BRE, "b\xc3\xa4\\+z", "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1},
  45. {BRE, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  46. {BRE, "b\xc3\xa4\\{1,2\\}z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  47. {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1},
  48. {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
  49. {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
  50. {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
  51. {BRE, "^x\\|x\xc3\xa4*z$", "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1},
  52. {BRE, "^x\\\\\xc3\x84\\{6\\}z\\+",
  53. "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
  54. {BRE, "^x\\\\\xc3\x84\\{2,36\\}z\\+", "x\\\xc3\x84zz\xc3\xb6", -1, 1},
  55. {BRE, "^x\\\\\xc3\x84\\{,3\\}z\\+",
  56. "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
  57. {BRE, "x[C]y", "axCy", 1, 1},
  58. {BRE, "x[ABC]y", "axCy", 1, 1},
  59. {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
  60. {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
  61. {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
  62. {BRE, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1},
  63. {BRE, "\\(x\xc3\x84\\)z\\1\x61\\1",
  64. "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1},
  65. {BRE, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
  66. {BRE, "x.y", "ax\xe2\x80\x94yz", 1, 1},
  67. {BRE, "x.*z", "\xc3\x84xz", 2, 1},
  68. {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1},
  69. {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1},
  70. {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1},
  71. {BRE, "x.\\?z", "axz", 1, 1},
  72. {BRE, "x.\\?z", "axyz", 1, 1},
  73. {BRE, "x.\\?z", "ax\xc3\x84z", 1, 1},
  74. {BRE, "x.\\?z", "ax\xe2\x80\x94z", 1, 1},
  75. {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80z", 1, 1},
  76. {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84z", 1, 1},
  77. {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1},
  78. {BRE, ".", "y", 0, 1},
  79. {BRE, ".", "\xc3\x84", 0, 1},
  80. {BRE, ".", "\xe2\x80\x94", 0, 1},
  81. {BRE, ".", "\xf0\x9d\x80\x80", 0, 1},
  82. {BRE, ".", "\xf9\x81\x82\x83\x84", 0, 1},
  83. {BRE, ".", "\xfd\xbf\xbf\xbf\xbf\xbf", 0, 1},
  84. {BRE, "x.\\?z", "axyyz", -1, 1},
  85. {BRE, "x.\\?z", "ax\xc3\x84\xc3\x96z", -1, 1},
  86. {BRE, "x.\\?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1},
  87. {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80yz", -1, 1},
  88. {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1},
  89. {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1},
  90. {BRE, "x.\\+z", "\xe2\x80\x94xz", -1, 1},
  91. {BRE, "x.\\+z", "\xe2\x80\x94xyz", 3, 1},
  92. {BRE, "x.\\+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1},
  93. {BRE, "x.\\+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  94. {BRE, "x.\\+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  95. {BRE, "x.\\+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1},
  96. {BRE, "x.\\+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  97. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xz", -1, 1},
  98. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1},
  99. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xyz", 3, 1},
  100. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1},
  101. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  102. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  103. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1},
  104. {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  105. {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "axz", 1, 1},
  106. {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1},
  107. {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xc3\x86z", 1, 1},
  108. {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xe2\x80\x96wz", 1, 1},
  109. {ERE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
  110. {ERE, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1},
  111. {ERE, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
  112. {ERE, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1},
  113. {ERE, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1},
  114. {ERE, "x[C]y", "axCy", 1, 1},
  115. {ERE, "x[ABC]y", "axCy", 1, 1},
  116. {ERE, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1},
  117. {ERE, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
  118. {ERE, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
  119. {ERE, "x.y", "ax\xe2\x80\x94yz", 1, 1},
  120. {ERE, "x.*z", "\xc3\x84xz", 2, 1},
  121. {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1},
  122. {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1},
  123. {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1},
  124. {ERE, "x.?z", "axz", 1, 1},
  125. {ERE, "x.?z", "axyz", 1, 1},
  126. {ERE, "x.?z", "ax\xc3\x84z", 1, 1},
  127. {ERE, "x.?z", "ax\xe2\x80\x94z", 1, 1},
  128. {ERE, "x.?z", "ax\xf0\x9d\x80\x80z", 1, 1},
  129. {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84z", 1, 1},
  130. {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1},
  131. {ERE, "x.?z", "axyyz", -1, 1},
  132. {ERE, "x.?z", "ax\xc3\x84\xc3\x96z", -1, 1},
  133. {ERE, "x.?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1},
  134. {ERE, "x.?z", "ax\xf0\x9d\x80\x80yz", -1, 1},
  135. {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1},
  136. {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1},
  137. {ERE, "x.+z", "\xe2\x80\x94xz", -1, 1},
  138. {ERE, "x.+z", "\xe2\x80\x94xyz", 3, 1},
  139. {ERE, "x.+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1},
  140. {ERE, "x.+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  141. {ERE, "x.+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  142. {ERE, "x.+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1},
  143. {ERE, "x.+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  144. {ERE, "x.{1,2}z", "\xe2\x80\x94xz", -1, 1},
  145. {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1},
  146. {ERE, "x.{1,2}z", "\xe2\x80\x94xyz", 3, 1},
  147. {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1},
  148. {ERE, "x.{1,2}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  149. {ERE, "x.{1,2}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  150. {ERE, "x.{1,2}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1},
  151. {ERE, "x.{1,2}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  152. {ERE, "x(.w|\xc3\x86)?z", "axz", 1, 1},
  153. {ERE, "x(.w|\xc3\x86)?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1},
  154. {ERE, "x(.w|\xc3\x86)?z", "ax\xc3\x86z", 1, 1},
  155. {ERE, "x(.w|\xc3\x86)?z", "ax\xe2\x80\x96wz", 1, 1},
  156. /* Should not be optimized. */
  157. {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
  158. {BRE, "x[A-Z,]y", "axCy", 1, 0},
  159. {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
  160. {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
  161. {BRE, "x[[=A=]]z", "axAz", 1, 0},
  162. {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
  163. {BRE, "\\<g", "\xe2\x80\x94g", 3, 0},
  164. {BRE, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
  165. {BRE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
  166. {BRE, "a\\wz", "a\xc3\x84z", 0, 0},
  167. {BRE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
  168. {ERE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
  169. {ERE, "x[A-Z,]y", "axCy", 1, 0},
  170. {ERE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
  171. {ERE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
  172. {ERE, "x[[=A=]]z", "axAz", 1, 0},
  173. {ERE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
  174. {ERE, "\\<g", "\xe2\x80\x94g", 3, 0},
  175. {ERE, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
  176. {ERE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
  177. {ERE, "a\\wz", "a\xc3\x84z", 0, 0},
  178. {ERE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
  179. };
  180. int
  181. main (void)
  182. {
  183. struct re_pattern_buffer regbuf;
  184. const char *err;
  185. size_t i;
  186. int ret = 0;
  187. mtrace ();
  188. setlocale (LC_ALL, "de_DE.UTF-8");
  189. for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
  190. {
  191. int res, optimized;
  192. re_set_syntax (tests[i].syntax);
  193. memset (&regbuf, '\0', sizeof (regbuf));
  194. err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
  195. &regbuf);
  196. if (err != NULL)
  197. {
  198. printf ("re_compile_pattern failed: %s\n", err);
  199. ret = 1;
  200. continue;
  201. }
  202. /* Check if re_search will be done as multi-byte or single-byte. */
  203. optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1;
  204. if (optimized != tests[i].optimize)
  205. {
  206. printf ("pattern %zd %soptimized while it should%s be\n",
  207. i, optimized ? "" : "not ", tests[i].optimize ? "" : " not");
  208. ret = 1;
  209. }
  210. int str_len = strlen (tests[i].string);
  211. res = re_search (&regbuf, tests[i].string, str_len, 0, str_len, NULL);
  212. if (res != tests[i].res)
  213. {
  214. printf ("re_search %zd failed: %d\n", i, res);
  215. ret = 1;
  216. regfree (&regbuf);
  217. continue;
  218. }
  219. res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len,
  220. NULL);
  221. if (res != tests[i].res)
  222. {
  223. printf ("backward re_search %zd failed: %d\n", i, res);
  224. ret = 1;
  225. regfree (&regbuf);
  226. continue;
  227. }
  228. regfree (&regbuf);
  229. re_set_syntax (tests[i].syntax | RE_ICASE);
  230. memset (&regbuf, '\0', sizeof (regbuf));
  231. err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
  232. &regbuf);
  233. if (err != NULL)
  234. {
  235. printf ("re_compile_pattern failed: %s\n", err);
  236. ret = 1;
  237. continue;
  238. }
  239. /* Check if re_search will be done as multi-byte or single-byte. */
  240. optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1;
  241. if (optimized)
  242. {
  243. printf ("pattern %zd optimized while it should not be when case insensitive\n",
  244. i);
  245. ret = 1;
  246. }
  247. res = re_search (&regbuf, tests[i].string, str_len, 0, str_len, NULL);
  248. if (res != tests[i].res)
  249. {
  250. printf ("ICASE re_search %zd failed: %d\n", i, res);
  251. ret = 1;
  252. regfree (&regbuf);
  253. continue;
  254. }
  255. res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len,
  256. NULL);
  257. if (res != tests[i].res)
  258. {
  259. printf ("ICASE backward re_search %zd failed: %d\n", i, res);
  260. ret = 1;
  261. regfree (&regbuf);
  262. continue;
  263. }
  264. regfree (&regbuf);
  265. }
  266. return ret;
  267. }