PageRenderTime 56ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/sys/src/cmd/awk/re.c

https://bitbucket.org/mischief/plan9front
C | 325 lines | 247 code | 25 blank | 53 comment | 89 complexity | 551cc8230d6566cb43179e48a434de74 MD5 | raw file
Possible License(s): AGPL-1.0, GPL-2.0, BSD-3-Clause, LGPL-3.0, 0BSD
  1. /****************************************************************
  2. Copyright (C) Lucent Technologies 1997
  3. All Rights Reserved
  4. Permission to use, copy, modify, and distribute this software and
  5. its documentation for any purpose and without fee is hereby
  6. granted, provided that the above copyright notice appear in all
  7. copies and that both that the copyright notice and this
  8. permission notice and warranty disclaimer appear in supporting
  9. documentation, and that the name Lucent Technologies or any of
  10. its entities not be used in advertising or publicity pertaining
  11. to distribution of the software without specific, written prior
  12. permission.
  13. LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  14. INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  15. IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  16. SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  17. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  18. IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  19. ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  20. THIS SOFTWARE.
  21. ****************************************************************/
  22. #define DEBUG
  23. #include <stdio.h>
  24. #include <ctype.h>
  25. #include <setjmp.h>
  26. #include <math.h>
  27. #include <string.h>
  28. #include <stdlib.h>
  29. #include <time.h>
  30. #include "awk.h"
  31. #include "y.tab.h"
  32. #include "regexp.h"
  33. /* This file provides the interface between the main body of
  34. * awk and the pattern matching package. It preprocesses
  35. * patterns prior to compilation to provide awk-like semantics
  36. * to character sequences not supported by the pattern package.
  37. * The following conversions are performed:
  38. *
  39. * "()" -> "[]*"
  40. * "[-" -> "[\-"
  41. * "[^-" -> "[^\-"
  42. * "-]" -> "\-]"
  43. * "[]" -> "[]*"
  44. * "\xdddd" -> "\z" where 'z' is the UTF sequence
  45. * for the hex value
  46. * "\ddd" -> "\o" where 'o' is a char octal value
  47. * "\b" -> "\B" where 'B' is backspace
  48. * "\t" -> "\T" where 'T' is tab
  49. * "\f" -> "\F" where 'F' is form feed
  50. * "\n" -> "\N" where 'N' is newline
  51. * "\r" -> "\C" where 'C' is cr
  52. */
  53. #define MAXRE 512 /* size of the preprocessing buffer re[], including the terminating NUL */
  54. static char re[MAXRE]; /* copy buffer: compre() builds the rewritten pattern here */
  55. char *patbeg; /* start of the most recent match; set by pmatch() and nematch() */
  56. int patlen; /* length in chars of the most recent match; -1 when nothing matched */
  57. #define NPATS 20 /* number of slots in pattern cache */
  58. static struct pat_list /* dynamic pattern cache */
  59. {
  60. char *re; /* pattern text as passed to compre(); used as the lookup key */
  61. int use; /* hit counter; the slot with the smallest value is evicted first */
  62. Reprog *program; /* compiled form returned by regcomp() */
  63. } pattern[NPATS];
  64. static int npats; /* cache fill level */
  65. /* Compile a pattern */
  66. void
  67. *compre(char *pat)
  68. {
  69. int i, j, inclass;
  70. char c, *p, *s;
  71. Reprog *program;
  72. if (!compile_time) { /* search cache for dynamic pattern */
  73. for (i = 0; i < npats; i++)
  74. if (!strcmp(pat, pattern[i].re)) {
  75. pattern[i].use++;
  76. return((void *) pattern[i].program);
  77. }
  78. }
  79. /* Preprocess Pattern for compilation */
  80. p = re;
  81. s = pat;
  82. inclass = 0;
  83. while (c = *s++) {
  84. if (c == '\\') {
  85. quoted(&s, &p, re+MAXRE);
  86. continue;
  87. }
  88. else if (!inclass && c == '(' && *s == ')') {
  89. if (p < re+MAXRE-2) { /* '()' -> '[]*' */
  90. *p++ = '[';
  91. *p++ = ']';
  92. c = '*';
  93. s++;
  94. }
  95. else overflow();
  96. }
  97. else if (c == '['){ /* '[-' -> '[\-' */
  98. inclass = 1;
  99. if (*s == '-') {
  100. if (p < re+MAXRE-2) {
  101. *p++ = '[';
  102. *p++ = '\\';
  103. c = *s++;
  104. }
  105. else overflow();
  106. } /* '[^-' -> '[^\-'*/
  107. else if (*s == '^' && s[1] == '-'){
  108. if (p < re+MAXRE-3) {
  109. *p++ = '[';
  110. *p++ = *s++;
  111. *p++ = '\\';
  112. c = *s++;
  113. }
  114. else overflow();
  115. }
  116. else if (*s == '['){ /* skip '[[' */
  117. if (p < re+MAXRE-1)
  118. *p++ = c;
  119. else overflow();
  120. c = *s++;
  121. }
  122. else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
  123. if (p < re+MAXRE-2) {
  124. *p++ = c;
  125. *p++ = *s++;
  126. c = *s++;
  127. }
  128. else overflow();
  129. }
  130. else if (*s == ']') { /* '[]' -> '[]*' */
  131. if (p < re+MAXRE-2) {
  132. *p++ = c;
  133. *p++ = *s++;
  134. c = '*';
  135. inclass = 0;
  136. }
  137. else overflow();
  138. }
  139. }
  140. else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
  141. if (p < re+MAXRE-1)
  142. *p++ = '\\';
  143. else overflow();
  144. }
  145. else if (c == ']')
  146. inclass = 0;
  147. if (p < re+MAXRE-1)
  148. *p++ = c;
  149. else overflow();
  150. }
  151. *p = 0;
  152. program = regcomp(re); /* compile pattern */
  153. if (!compile_time) {
  154. if (npats < NPATS) /* Room in cache */
  155. i = npats++;
  156. else { /* Throw out least used */
  157. int use = pattern[0].use;
  158. i = 0;
  159. for (j = 1; j < NPATS; j++) {
  160. if (pattern[j].use < use) {
  161. use = pattern[j].use;
  162. i = j;
  163. }
  164. }
  165. xfree(pattern[i].program);
  166. xfree(pattern[i].re);
  167. }
  168. pattern[i].re = tostring(pat);
  169. pattern[i].program = program;
  170. pattern[i].use = 1;
  171. }
  172. return((void *) program);
  173. }
  174. /* T/F match indication - matched string not exported */
  175. int
  176. match(void *p, char *s, char *)
  177. {
  178. return regexec((Reprog *) p, (char *) s, 0, 0);
  179. }
  180. /* match and delimit the matched string */
  181. int
  182. pmatch(void *p, char *s, char *start)
  183. {
  184. Resub m;
  185. m.s.sp = start;
  186. m.e.ep = 0;
  187. if (regexec((Reprog *) p, (char *) s, &m, 1)) {
  188. patbeg = m.s.sp;
  189. patlen = m.e.ep-m.s.sp;
  190. return 1;
  191. }
  192. patlen = -1;
  193. patbeg = start;
  194. return 0;
  195. }
  196. /* perform a non-empty match */
  197. int
  198. nematch(void *p, char *s, char *start)
  199. {
  200. if (pmatch(p, s, start) == 1 && patlen > 0)
  201. return 1;
  202. patlen = -1;
  203. patbeg = start;
  204. return 0;
  205. }
  /* in the parsing of regular expressions, metacharacters like . have */
  /* to be seen literally; \056 is not a metacharacter. */

  /*
   * hexstr: find and evaluate the hex string at *pp.
   *
   * Consumes at most four hexadecimal digits, advances *pp past the
   * digits it used and returns their value.  Returns 0 and leaves *pp
   * unchanged when *pp does not start with a hex digit.
   */
  int
  hexstr(char **pp)
  {
      int n = 0;
      int i;

      for (i = 0; i < 4; i++) {
          /* <ctype.h> classifiers need an unsigned char value, never a negative char */
          unsigned char c = (unsigned char) (*pp)[i];

          if (!isxdigit(c))
              break;
          if (isdigit(c))
              n = 16 * n + c - '0';
          else if ('a' <= c && c <= 'f')
              n = 16 * n + c - 'a' + 10;
          else if ('A' <= c && c <= 'F')
              n = 16 * n + c - 'A' + 10;
      }
      *pp += i;
      return n;
  }
  /* look for awk-specific escape sequences */

  #define isoctdigit(c) ((c) >= '0' && (c) <= '7')    /* multiple use of arg */

  /*
   * quoted: handle the escape sequence that follows a backslash.
   *
   * On entry *s points just past the backslash in the source pattern and
   * *to at the next free byte of the output buffer, which ends at end.
   * Both pointers are advanced past what was consumed / produced.
   *
   *   \t \n \f \r \b    -> the control character itself, no backslash
   *   \xh..h (<= 4 hex) -> backslash + multibyte encoding of the value
   *   \d \dd \ddd       -> backslash + the character with that octal value
   *   anything else     -> backslash + that character
   *   backslash at the very end of the pattern
   *                     -> an escaped (literal) backslash
   *
   * overflow() is called whenever the output buffer cannot take the
   * next byte(s) while still leaving room for the terminating NUL.
   */
  void
  quoted(char **s, char **to, char *end)
  {
      char *p = *s;
      char *t = *to;
      wchar_t c;
      int len;

      switch (c = *p++) {
      case 't':
          c = '\t';
          break;
      case 'n':
          c = '\n';
          break;
      case 'f':
          c = '\f';
          break;
      case 'r':
          c = '\r';
          break;
      case 'b':
          c = '\b';
          break;
      default:
          if (c == '\0') {
              /*
               * The backslash was the last character.  Do not step
               * past the terminator (the caller would go on scanning
               * beyond the end of the pattern); take the backslash
               * literally instead.
               */
              p--;
              c = '\\';
          }
          if (t < end-1)        /* all else must be escaped */
              *t++ = '\\';
          else
              overflow();
          if (c == 'x') {        /* hexadecimal goo follows */
              c = hexstr(&p);
              if (t < end-MB_CUR_MAX) {
                  len = wctomb(t, c);
                  if (len < 0)
                      t--;    /* value has no encoding: retract the backslash */
                  else
                      t += len;
              } else {
                  overflow();
              }
              *to = t;
              *s = p;
              return;
          } else if (isoctdigit(c)) {    /* \d \dd \ddd */
              c -= '0';
              if (isoctdigit(*p)) {
                  c = 8 * c + *p++ - '0';
                  if (isoctdigit(*p))
                      c = 8 * c + *p++ - '0';
              }
          }
          break;
      }
      if (t < end-1)
          *t++ = c;
      else
          overflow();        /* never drop a pattern character silently */
      *s = p;
      *to = t;
  }
  /*
   * countposn: count rune positions.
   *
   * Returns the number of characters in the first n bytes of s, stopping
   * early at a NUL.  A byte sequence that mblen() cannot decode counts
   * as one position per byte.
   */
  int
  countposn(char *s, int n)
  {
      int i, j;
      char *end;

      /* test the bound before dereferencing so s[n] is never read */
      for (i = 0, end = s+n; s < end && *s; i++) {
          /* let mblen look only at the bytes that are still inside the range */
          j = mblen(s, end - s);
          if (j <= 0)
              j = 1;
          s += j;
      }
      return(i);
  }
  288. /* pattern package error handler: passes the message s on to FATAL() */
  289. void
  290. regerror(char *s)
  291. {
  292. FATAL("%s", s); /* "%s" keeps any '%' in s from being read as a format directive */
  293. }
  /* overflow: report through FATAL() that the rewritten pattern does not fit in the copy buffer */
  294. void
  295. overflow(void)
  296. {
  297. FATAL("%s", "regular expression too big");
  298. }