PageRenderTime 58ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/src/cmd/awk/re.c

https://bitbucket.org/clivehayward/plan9port
C | 325 lines | 246 code | 26 blank | 53 comment | 89 complexity | b46f9eedaf3bd397583c8b51608d115c MD5 | raw file
Possible License(s): LGPL-2.1, MPL-2.0-no-copyleft-exception, Unlicense
  1. /****************************************************************
  2. Copyright (C) Lucent Technologies 1997
  3. All Rights Reserved
  4. Permission to use, copy, modify, and distribute this software and
  5. its documentation for any purpose and without fee is hereby
  6. granted, provided that the above copyright notice appear in all
  7. copies and that both that the copyright notice and this
  8. permission notice and warranty disclaimer appear in supporting
  9. documentation, and that the name Lucent Technologies or any of
  10. its entities not be used in advertising or publicity pertaining
  11. to distribution of the software without specific, written prior
  12. permission.
  13. LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  14. INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  15. IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  16. SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  17. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  18. IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  19. ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  20. THIS SOFTWARE.
  21. ****************************************************************/
  22. #define DEBUG
  23. #include <stdio.h>
  24. #include <u.h>
  25. #include <libc.h>
  26. #include <ctype.h>
  27. #include <bio.h>
  28. #include <regexp.h>
  29. #include "awk.h"
  30. #include "y.tab.h"
  31. /* This file provides the interface between the main body of
  32. * awk and the pattern matching package. It preprocesses
  33. * patterns prior to compilation to provide awk-like semantics
  34. * to character sequences not supported by the pattern package.
  35. * The following conversions are performed:
  36. *
  37. * "()" -> "[]*"
  38. * "[-" -> "[\-"
  39. * "[^-" -> "[^\-"
  40. * "-]" -> "\-]"
  41. * "[]" -> "[]*"
  42. * "\xdddd" -> "\z" where 'z' is the UTF sequence
  43. * for the hex value
  44. * "\ddd" -> "\o" where 'o' is a char octal value
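  45. * "\b" -> "B" where 'B' is backspace (stored without a backslash)
  46. * "\t" -> "T" where 'T' is tab (stored without a backslash)
  47. * "\f" -> "F" where 'F' is form feed (stored without a backslash)
  48. * "\n" -> "N" where 'N' is newline (stored without a backslash)
  49. * "\r" -> "C" where 'C' is carriage return (stored without a backslash)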
  50. */
  51. #define MAXRE 512 /* size of the copy buffer re[]; compre() keeps one byte for the terminating NUL */
  52. static char re[MAXRE]; /* copy buffer: compre() builds the preprocessed pattern here and passes it to regcomp() */
  53. char *patbeg; /* start of the most recent match; written by pmatch() and nematch(), reset to their 'start' argument on failure */
  54. int patlen; /* length of the most recent match (end pointer minus start pointer), or -1 when nothing matched */
  55. #define NPATS 20 /* number of slots in pattern cache */
  56. static struct pat_list /* dynamic pattern cache, used by compre() when compile_time is zero */
  57. {
  58. char *re; /* copy of the unprocessed pattern text (made with tostring()); the lookup key */
  59. int use; /* hit counter: 1 on insertion, incremented on every cache hit; lowest count is evicted */
  60. Reprog *program; /* compiled program returned by regcomp() */
  61. } pattern[NPATS];
  62. static int npats; /* cache fill level */
  63. /* Compile a pattern */
  64. void
  65. *compre(char *pat)
  66. {
  67. int i, j, inclass;
  68. char c, *p, *s;
  69. Reprog *program;
  70. if (!compile_time) { /* search cache for dynamic pattern */
  71. for (i = 0; i < npats; i++)
  72. if (!strcmp(pat, pattern[i].re)) {
  73. pattern[i].use++;
  74. return((void *) pattern[i].program);
  75. }
  76. }
  77. /* Preprocess Pattern for compilation */
  78. p = re;
  79. s = pat;
  80. inclass = 0;
  81. while (c = *s++) {
  82. if (c == '\\') {
  83. quoted(&s, &p, re+MAXRE);
  84. continue;
  85. }
  86. else if (!inclass && c == '(' && *s == ')') {
  87. if (p < re+MAXRE-2) { /* '()' -> '[]*' */
  88. *p++ = '[';
  89. *p++ = ']';
  90. c = '*';
  91. s++;
  92. }
  93. else overflow();
  94. }
  95. else if (c == '['){ /* '[-' -> '[\-' */
  96. inclass = 1;
  97. if (*s == '-') {
  98. if (p < re+MAXRE-2) {
  99. *p++ = '[';
  100. *p++ = '\\';
  101. c = *s++;
  102. }
  103. else overflow();
  104. } /* '[^-' -> '[^\-'*/
  105. else if (*s == '^' && s[1] == '-'){
  106. if (p < re+MAXRE-3) {
  107. *p++ = '[';
  108. *p++ = *s++;
  109. *p++ = '\\';
  110. c = *s++;
  111. }
  112. else overflow();
  113. }
  114. else if (*s == '['){ /* skip '[[' */
  115. if (p < re+MAXRE-1)
  116. *p++ = c;
  117. else overflow();
  118. c = *s++;
  119. }
  120. else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
  121. if (p < re+MAXRE-2) {
  122. *p++ = c;
  123. *p++ = *s++;
  124. c = *s++;
  125. }
  126. else overflow();
  127. }
  128. else if (*s == ']') { /* '[]' -> '[]*' */
  129. if (p < re+MAXRE-2) {
  130. *p++ = c;
  131. *p++ = *s++;
  132. c = '*';
  133. inclass = 0;
  134. }
  135. else overflow();
  136. }
  137. }
  138. else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
  139. if (p < re+MAXRE-1)
  140. *p++ = '\\';
  141. else overflow();
  142. }
  143. else if (c == ']')
  144. inclass = 0;
  145. if (p < re+MAXRE-1)
  146. *p++ = c;
  147. else overflow();
  148. }
  149. *p = 0;
  150. program = regcomp(re); /* compile pattern */
  151. if (!compile_time) {
  152. if (npats < NPATS) /* Room in cache */
  153. i = npats++;
  154. else { /* Throw out least used */
  155. int use = pattern[0].use;
  156. i = 0;
  157. for (j = 1; j < NPATS; j++) {
  158. if (pattern[j].use < use) {
  159. use = pattern[j].use;
  160. i = j;
  161. }
  162. }
  163. xfree(pattern[i].program);
  164. xfree(pattern[i].re);
  165. }
  166. pattern[i].re = tostring(pat);
  167. pattern[i].program = program;
  168. pattern[i].use = 1;
  169. }
  170. return((void *) program);
  171. }
  172. /* T/F match indication - matched string not exported */
  173. int
  174. match(void *p, char *s, char *start)
  175. {
  176. return regexec((Reprog *) p, (char *) s, 0, 0);
  177. }
  178. /* match and delimit the matched string */
  179. int
  180. pmatch(void *p, char *s, char *start)
  181. {
  182. Resub m;
  183. m.s.sp = start;
  184. m.e.ep = 0;
  185. if (regexec((Reprog *) p, (char *) s, &m, 1)) {
  186. patbeg = m.s.sp;
  187. patlen = m.e.ep-m.s.sp;
  188. return 1;
  189. }
  190. patlen = -1;
  191. patbeg = start;
  192. return 0;
  193. }
  194. /* perform a non-empty match */
  195. int
  196. nematch(void *p, char *s, char *start)
  197. {
  198. if (pmatch(p, s, start) == 1 && patlen > 0)
  199. return 1;
  200. patlen = -1;
  201. patbeg = start;
  202. return 0;
  203. }
  204. /* in the parsing of regular expressions, metacharacters like . have */
  205. /* to be seen literally; \056 is not a metacharacter. */
  /*
   * Evaluate the hexadecimal string at *pp.
   *
   * Reads at most four hex digits (either case) starting at *pp and stops
   * at the first character that is not a hex digit.
   *
   * Returns the accumulated value (0 when no digit is present) and advances
   * *pp past the digits that were consumed.
   */
  int
  hexstr(char **pp)
  {
      const char *digits = *pp;
      int n = 0;
      int i;

      for (i = 0; i < 4; i++) {
          /*
           * The <ctype.h> classifiers are only defined for values
           * representable as unsigned char (or EOF), so a plain char that
           * may be negative must be converted before it is tested.
           */
          unsigned char c = (unsigned char) digits[i];

          if (!isxdigit(c))
              break;
          if (isdigit(c))
              n = 16 * n + (c - '0');
          else if ('a' <= c && c <= 'f')
              n = 16 * n + (c - 'a' + 10);
          else if ('A' <= c && c <= 'F')
              n = 16 * n + (c - 'A' + 10);
      }
      *pp += i;
      return n;
  }
  /* look for awk-specific escape sequences */
  #define isoctdigit(c) ((c) >= '0' && (c) <= '7')    /* multiple use of arg */

  /*
   * Handle one escaped sequence.
   *
   * On entry *s points at the character following a backslash in the source
   * pattern and *to at the next free byte of the output buffer, whose end
   * (one past the last byte) is 'end'.  On return both pointers have been
   * advanced past what was consumed and produced.
   *
   *   \t \n \f \r \b   stored as the control character, with no backslash
   *   \xdddd           stored as a backslash followed by the multibyte
   *                    encoding of the (up to four digit) hex value
   *   \d \dd \ddd      stored as a backslash followed by the octal value
   *   \<other>         stored as a backslash followed by that character
   *   \ at end of text stored as an escaped backslash; *s is left on the
   *                    terminating NUL so the caller's scan stops there
   *
   * overflow() is called whenever the output would not fit while leaving
   * room for the caller's terminating NUL.
   */
  void
  quoted(char **s, char **to, char *end)
  {
      char *p = *s;
      char *t = *to;
      wchar_t c;
      int len;

      if (*p == '\0') {
          /*
           * Trailing backslash.  Stepping over the terminator here would
           * make the caller continue reading past the end of the pattern,
           * so emit a literal backslash and leave *s untouched.
           */
          if (t < end-2) {
              *t++ = '\\';
              *t++ = '\\';
          }
          else overflow();
          *to = t;
          return;
      }

      switch (c = *p++) {
      case 't':
          c = '\t';
          break;
      case 'n':
          c = '\n';
          break;
      case 'f':
          c = '\f';
          break;
      case 'r':
          c = '\r';
          break;
      case 'b':
          c = '\b';
          break;
      default:
          if (t < end-1)    /* all else must be escaped */
              *t++ = '\\';
          else overflow();
          if (c == 'x') {    /* hexadecimal goo follows */
              c = hexstr(&p);
              if (t < end-MB_CUR_MAX) {
                  len = wctomb(t, c);
                  if (len < 0)
                      /*
                       * Not representable in the current locale.  Never
                       * move t backwards; fall back to the low-order byte.
                       */
                      *t++ = (char) c;
                  else
                      t += len;
              }
              else overflow();
              *to = t;
              *s = p;
              return;
          } else if (isoctdigit(c)) {    /* \d \dd \ddd */
              c -= '0';
              if (isoctdigit(*p)) {
                  c = 8 * c + *p++ - '0';
                  if (isoctdigit(*p))
                      c = 8 * c + *p++ - '0';
              }
          }
          break;
      }
      if (t < end-1)
          *t++ = c;
      else overflow();
      *s = p;
      *to = t;
  }
  /*
   * Count rune positions.
   *
   * Counts the character positions in the first n bytes of s, stopping
   * early at a NUL.  Each step advances by the length mblen() reports for
   * the character at the current position; a byte that mblen() cannot
   * decode counts as one position of one byte.
   *
   * mblen() is limited to the bytes that remain before s+n, so a character
   * is never decoded across the caller's limit, and the limit is tested
   * before the byte at the current position is read.
   *
   * Returns the number of positions; 0 when n is not positive.
   */
  int
  countposn(char *s, int n)
  {
      int count = 0;
      char *end;

      if (n <= 0)
          return 0;
      end = s + n;
      while (s < end && *s != '\0') {
          int len = mblen(s, (size_t) (end - s));

          if (len <= 0)
              len = 1;
          s += len;
          count++;
      }
      return count;
  }
  /*
   * Pattern package error handler: passes the message s to FATAL.
   */
  void
  regerror(char *s)
  {
      char *msg = s;

      FATAL("%s", msg);
  }
  /*
   * Report, through FATAL, that a preprocessed regular expression does not
   * fit in the copy buffer.
   */
  void
  overflow(void)
  {
      static char toobig[] = "regular expression too big";

      FATAL("%s", toobig);
  }