PageRenderTime 52ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/src/regex.c

http://github.com/lukesandberg/Regex
C | 289 lines | 268 code | 14 blank | 7 comment | 46 complexity | 5e34d942f238f05039e1cafea8018b0b MD5 | raw file
  1. #include <re_compiler.h>
  2. #include <re_ast.h>
  3. #include <util/util.h>
  4. #include <util/sparse_map.h>
  5. #include <thread_state.h>
  6. #include <re_parser.h>
  7. #include <capture_group.h>
  8. #include <ctype.h>
  9. #include <stdio.h>
  10. #include <string.h>
  11. #include <regex.h>
  12. struct re_run_state
  13. {
  14. regex* re;
  15. char* str;
  16. ts_cache* cache;
  17. sparse_map* clst;
  18. sparse_map* nlst;
  19. capture_group** r_caps;
  20. };
  21. static int add_to_list(struct re_run_state *state, sparse_map* map, unsigned int pc_index, thread_state* ts, char* c)
  22. {
  23. unsigned int reg_val = 0;
  24. tailcall:
  25. if(!sparse_map_contains(map, pc_index))
  26. {
  27. sparse_map_set(map, pc_index, ts);
  28. instruction *pc = state->re->prog.code + pc_index;
  29. switch(pc->op)
  30. {
  31. case I_JMP:
  32. pc_index = pc->v.jump;
  33. goto tailcall;
  34. case I_SPLIT:
  35. ts_incref(ts);
  36. //can't avoid recursion here
  37. if(add_to_list(state, map, pc->v.split.left, ts, c) == 0)
  38. return 0;//propgate error
  39. pc_index = pc->v.split.right;
  40. goto tailcall;
  41. case I_SAVE:
  42. ts = ts_update(state->cache, ts, pc->v.save_register, (unsigned int) (c - state->str));
  43. if(ts == NULL)
  44. return 0;
  45. pc_index++;
  46. goto tailcall;
  47. case I_DGTEQ:
  48. reg_val = ts->regs[pc->v.comparison.idx];
  49. if(reg_val >= pc->v.comparison.comp)
  50. {
  51. ts_decref(state->cache, ts);
  52. }
  53. else
  54. {
  55. pc_index++;
  56. goto tailcall;
  57. }
  58. break;
  59. case I_DLT:
  60. reg_val = ts->regs[pc->v.comparison.idx];
  61. if(reg_val < pc->v.comparison.comp)
  62. {
  63. ts_decref(state->cache, ts);
  64. }
  65. else
  66. {
  67. pc_index++;
  68. goto tailcall;
  69. }
  70. break;
  71. case I_SETZ:
  72. ts = ts_update(state->cache, ts, pc->v.idx, 0);
  73. pc_index++;
  74. goto tailcall;
  75. case I_INCR:
  76. reg_val = ts->regs[pc->v.idx];
  77. ts = ts_update(state->cache, ts, pc->v.idx, reg_val + 1);
  78. pc_index++;
  79. goto tailcall;
  80. case I_CHAR:
  81. case I_WHITESPACE:
  82. case I_WILDCARD:
  83. case I_ALPHA:
  84. case I_DIGIT:
  85. case I_MATCH:
  86. //fallthrough, we could just give an empty default case
  87. //but this will give us warnings if we add new instructions
  88. //to make sure it is properly handled here
  89. ;
  90. }
  91. }
  92. return 1;
  93. }
  94. static void free_list(struct re_run_state* state, sparse_map* lst)
  95. {
  96. for(unsigned int i = 0; i < sparse_map_num_entries(lst); i++)
  97. {
  98. void* val = NULL;
  99. sparse_map_get_entry(lst, i, &val);
  100. if(val != NULL)
  101. {
  102. ts_decref(state->cache, (thread_state*) val);
  103. }
  104. }
  105. free_sparse_map(lst);
  106. }
  107. static inline capture_group* extract_capture_groups(struct re_run_state* state, thread_state* ts)
  108. {
  109. capture_group* cg = NEWE(capture_group, sizeof(char*) * state->re->num_capture_regs);
  110. if(cg == NULL)
  111. return NULL;
  112. unsigned int len = state->re->num_capture_regs;
  113. char* str = state->str;
  114. for(unsigned int i = 0; i < len; i++)
  115. {
  116. cg->regs[i] = str + ts->regs[i];
  117. }
  118. cg->sz = len;
  119. return cg;
  120. }
  121. static int init_state(regex* re, char* str, capture_group** r_caps, struct re_run_state* state)
  122. {
  123. size_t len = re->prog.size;
  124. state->r_caps = r_caps;
  125. state->re = re;
  126. state->str = str;
  127. state->cache = make_ts_cache(len);
  128. if(state->cache == NULL)
  129. return 0;
  130. state->clst = make_sparse_map(len);
  131. if(state->clst == NULL)
  132. {
  133. free_ts_cache(state->cache);
  134. return 0;
  135. }
  136. state->nlst = make_sparse_map(len);
  137. if(state->nlst == NULL)
  138. {
  139. free_ts_cache(state->cache);
  140. free_list(state, state->clst);
  141. return 0;
  142. }
  143. return 1;
  144. }
  145. static void swap_lists(struct re_run_state* state)
  146. {
  147. sparse_map* tmp = state->nlst;
  148. state->nlst = state->clst;
  149. sparse_map_clear(state->nlst);
  150. state->clst = tmp;
  151. }
  152. static void free_state(struct re_run_state* state)
  153. {
  154. free_list(state, state->nlst);
  155. free_list(state, state->clst);
  156. free_ts_cache(state->cache);
  157. }
  158. static int process_char(struct re_run_state* state, char* c)
  159. {
  160. for(unsigned int i = 0; i < sparse_map_num_entries(state->clst); i++)
  161. {
  162. void* val = NULL;
  163. unsigned int pc_index = sparse_map_get_entry(state->clst, i, &val);
  164. instruction* pc = state->re->prog.code + pc_index;
  165. thread_state* ts = (thread_state*) val;
  166. int v = 0;
  167. switch(pc->op)
  168. {
  169. case I_CHAR:
  170. v = (pc->v.c == *c);
  171. break;
  172. case I_ALPHA:
  173. v = isalpha(*c);
  174. break;
  175. case I_WHITESPACE:
  176. v = isspace(*c);
  177. break;
  178. case I_DIGIT:
  179. v = isdigit(*c);
  180. break;
  181. case I_WILDCARD:
  182. v = (*c != '\0');
  183. break;
  184. case I_MATCH:
  185. v = 0;//we never actually go past this
  186. if(*c == '\0')//we are at the end (all matches are anchored)
  187. //by default
  188. {
  189. if(state->r_caps != NULL)
  190. {
  191. *(state->r_caps) = extract_capture_groups(state, ts);
  192. if(*(state->r_caps) == NULL) //extraction failed
  193. {
  194. return -1;
  195. }
  196. }
  197. return 1;
  198. }
  199. break;
  200. case I_JMP:
  201. case I_SPLIT:
  202. case I_SAVE:
  203. case I_DGTEQ:
  204. case I_DLT:
  205. case I_SETZ:
  206. case I_INCR:
  207. v = -1;
  208. //skip over control flow because we already processed it in add_to_list
  209. }
  210. if(v > 0)//we did pass the test
  211. {
  212. if(!add_to_list(state, state->nlst, pc_index + 1, ts, c + 1))
  213. {
  214. return -1;
  215. }
  216. }
  217. else if(v == 0)
  218. {
  219. //thread death
  220. ts_decref(state->cache, ts);
  221. }
  222. }
  223. return 0;
  224. }
  225. int regex_matches(regex* re, char*str, capture_group** r_caps)
  226. {
  227. int rval = -1;
  228. char* c = str;
  229. thread_state* ts;
  230. struct re_run_state state;
  231. if(!init_state(re, str, r_caps, &state))
  232. return -1;
  233. ts = make_thread_state(state.cache, re->num_registers);
  234. if(ts == NULL)
  235. goto end;
  236. if(!add_to_list(&state, state.clst, 0, ts, c))
  237. goto end;
  238. rval = 0;
  239. do
  240. {
  241. int v = process_char(&state, c);
  242. if(v != 0)
  243. {
  244. rval = v;
  245. break;
  246. }
  247. swap_lists(&state);
  248. } while(*c++ != '\0');
  249. end:
  250. free_state(&state);
  251. return rval;
  252. }
  253. regex* regex_create(char* re_str, re_error* er)
  254. {
  255. ast_node* tree = re_parse(re_str, er);
  256. if(tree == NULL)
  257. return NULL;//there was an error during parsing
  258. regex * re = compile_regex(tree);
  259. free_node(tree);
  260. if(re == NULL)
  261. {
  262. if(er != NULL)
  263. {
  264. er->errno = E_OUT_OF_MEMORY;
  265. er->position = -1;
  266. }
  267. return NULL;
  268. }
  269. re->src = re_str;
  270. return re;
  271. }
  272. void regex_destroy(regex* re)
  273. {
  274. rfree(re);
  275. }