PageRenderTime 77ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 1ms

/lpeg.c

https://gitlab.com/g10h4ck/nmap-gsoc2015
C | 3706 lines | 2569 code | 513 blank | 624 comment | 489 complexity | 8375c196726095a13fc4ca029271b715 MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, Apache-2.0, LGPL-2.0, LGPL-2.1, MIT

Large files are truncated, but you can click here to view the full file

  1. /*
  2. ** $Id: lptypes.h,v 1.8 2013/04/12 16:26:38 roberto Exp $
  3. ** LPeg - PEG pattern matching for Lua
  4. ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
  5. ** written by Roberto Ierusalimschy
  6. */
  7. #if !defined(lptypes_h)
  8. #define lptypes_h
  9. #if !defined(LPEG_DEBUG)
  10. #define NDEBUG
  11. #endif
  12. #include <assert.h>
  13. #include <limits.h>
  14. #include "lua.h"
  /* Library version string, plus two fixed string names.
  ** NOTE(review): PATTERN_T and MAXSTACKIDX are presumably a metatable
  ** name and a registry key; their uses are not in this part of the
  ** file, so confirm at the use sites. */
  15. #define VERSION "0.12"
  16. #define PATTERN_T "lpeg-pattern"
  17. #define MAXSTACKIDX "lpeg-maxstack"
  18. /*
  19. ** compatibility with Lua 5.2
  20. */
  /*
  ** When compiled against exactly Lua 5.2 (LUA_VERSION_NUM == 502), the
  ** names below are redefined in terms of lua_compare,
  ** lua_getuservalue/lua_setuservalue, lua_rawlen and
  ** luaL_setfuncs/luaL_newlib. Any other version keeps whatever
  ** definitions the Lua headers provide.
  ** NOTE(review): luaL_register expands to a bare '{ ... }' block, so a
  ** call followed by ';' directly before an 'else' would not parse;
  ** callers must use it as a plain statement.
  */
  21. #if (LUA_VERSION_NUM == 502)
  22. #undef lua_equal
  23. #define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ)
  24. #undef lua_getfenv
  25. #define lua_getfenv lua_getuservalue
  26. #undef lua_setfenv
  27. #define lua_setfenv lua_setuservalue
  28. #undef lua_objlen
  29. #define lua_objlen lua_rawlen
  30. #undef luaL_register
  31. #define luaL_register(L,n,f) \
  32. { if ((n) == NULL) luaL_setfuncs(L,f,0); else luaL_newlib(L,f); }
  33. #endif
  /*
  ** Tuning constants and Lua-stack layout. Relative to 'ptop' (index of
  ** the last argument to 'match'), the macros below place the capture
  ** list at ptop + 2, the ktable at ptop + 3 and the backtracking stack
  ** at ptop + 4; 'updatecache' additionally uses ptop + 1 as a cache slot.
  */
  34. /* default maximum size for call/backtrack stack */
  35. #if !defined(MAXBACK)
  36. #define MAXBACK 100
  37. #endif
  38. /* maximum number of rules in a grammar */
  39. #define MAXRULES 200
  40. /* initial size for capture's list */
  41. #define INITCAPSIZE 32
  42. /* index, on Lua stack, for subject */
  43. #define SUBJIDX 2
  44. /* number of fixed arguments to 'match' (before capture arguments) */
  45. #define FIXEDARGS 3
  46. /* index, on Lua stack, for capture list */
  47. #define caplistidx(ptop) ((ptop) + 2)
  48. /* index, on Lua stack, for pattern's ktable */
  49. #define ktableidx(ptop) ((ptop) + 3)
  50. /* index, on Lua stack, for backtracking stack */
  51. #define stackidx(ptop) ((ptop) + 4)
  /*
  ** A Charset is a bitmap with one bit per possible 'unsigned char'
  ** value: character 'b' is stored in byte 'b >> 3', bit 'b & 7'
  ** (see 'setchar' here and 'testchar' further down).
  */
  52. typedef unsigned char byte;
  53. #define BITSPERCHAR 8
  54. #define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1)
  55. typedef struct Charset {
  56. byte cs[CHARSETSIZE];
  57. } Charset;
  /* run statement(s) 'b' once for each byte index of a charset; the
  ** macro declares the loop variable 'v' itself, inside its own block */
  58. #define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} }
  59. /* access to charset */
  /* (the bytes are taken to start right after tree node 't') */
  60. #define treebuffer(t) ((byte *)((t) + 1))
  61. /* number of slots needed for 'n' bytes */
  /* (a slot is one TTree; the result is n / sizeof(TTree) rounded up) */
  62. #define bytes2slots(n) (((n) - 1) / sizeof(TTree) + 1)
  63. /* set 'b' bit in charset 'cs' */
  64. #define setchar(cs,b) ((cs)[(b) >> 3] |= (1 << ((b) & 7)))
  65. /*
  66. ** in capture instructions, 'kind' of capture and its offset are
  67. ** packed in field 'aux', 4 bits for each
  68. */
  /* kind is the low nibble of 'aux', offset the high nibble */
  69. #define getkind(op) ((op)->i.aux & 0xF)
  70. #define getoff(op) (((op)->i.aux >> 4) & 0xF)
  71. #define joinkindoff(k,o) ((k) | ((o) << 4))
  /* largest values that fit in the 4-bit offset and in the 8-bit 'aux' */
  72. #define MAXOFF 0xF
  73. #define MAXAUX 0xFF
  74. /* maximum number of bytes to look behind */
  75. #define MAXBEHIND MAXAUX
  76. /* maximum size (in elements) for a pattern */
  77. #define MAXPATTSIZE (SHRT_MAX - 10)
  78. /* size (in elements) for an instruction plus extra l bytes */
  /* ('l' bytes rounded up to whole Instruction cells, plus one cell) */
  79. #define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1)
  80. /* size (in elements) for a ISet instruction */
  81. #define CHARSETINSTSIZE instsize(CHARSETSIZE)
  82. /* size (in elements) for a IFunc instruction */
  83. #define funcinstsize(p) ((p)->i.aux + 2)
  /* non-zero iff the bit for character 'c' is set in bitmap 'st' */
  84. #define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7))))
  85. #endif
  86. /*
  87. ** $Id: lptree.h,v 1.2 2013/03/24 13:51:12 roberto Exp $
  88. */
  89. #if !defined(lptree_h)
  90. #define lptree_h
  91. /*
  92. ** types of trees
  93. */
  /*
  ** Tag stored in field 'tag' of a TTree node. TChar is explicitly 0 and
  ** the remaining tags follow in declaration order; the tag is also used
  ** as an index into 'numsiblings'.
  */
  94. typedef enum TTag {
  95. TChar = 0, TSet, TAny, /* standard PEG elements */
  96. TTrue, TFalse,
  97. TRep,
  98. TSeq, TChoice,
  99. TNot, TAnd,
  100. TCall,
  101. TOpenCall,
  102. TRule, /* sib1 is rule's pattern, sib2 is 'next' rule */
  103. TGrammar, /* sib1 is initial (and first) rule */
  104. TBehind, /* match behind */
  105. TCapture, /* regular capture */
  106. TRunTime /* run-time capture */
  107. } TTag;
  108. /* number of siblings for each tree */
  /* (indexed by tag; the array is defined outside this header) */
  109. extern const byte numsiblings[];
  110. /*
  111. ** Tree trees
  112. ** The first sibling of a tree (if there is one) is immediately after
  113. ** the tree. A reference to a second sibling (ps) is its position
  114. ** relative to the position of the tree itself. A key in ktable
  115. ** uses the (unique) address of the original tree that created that
  116. ** entry. NULL means no data.
  117. */
  118. typedef struct TTree {
  119. byte tag; /* a TTag value */
  120. byte cap; /* kind of capture (if it is a capture) */
  121. unsigned short key; /* key in ktable for Lua data (0 if no key) */
  122. union {
  123. int ps; /* occasional second sibling */
  124. int n; /* occasional counter */
  125. } u;
  126. } TTree;
  127. /*
  128. ** A complete pattern has its tree plus, if already compiled,
  129. ** its corresponding code
  130. */
  131. typedef struct Pattern {
  132. union Instruction *code; /* compiled program, if any */
  133. int codesize; /* NOTE(review): presumably the size of 'code' in elements - confirm */
  134. TTree tree[1]; /* first node; further nodes are addressed past it (see sib1/sib2) */
  135. } Pattern;
  136. /* number of siblings for each tree */
  /* (repeats the declaration made earlier in this header) */
  137. extern const byte numsiblings[];
  138. /* access to siblings */
  /* sib1 is the node right after 't'; sib2 is 'u.ps' nodes away from 't' */
  139. #define sib1(t) ((t) + 1)
  140. #define sib2(t) ((t) + (t)->u.ps)
  141. #endif
  142. /*
  143. ** $Id: lpcap.h,v 1.1 2013/03/21 20:25:12 roberto Exp $
  144. */
  145. #if !defined(lpcap_h)
  146. #define lpcap_h
  147. /* kinds of captures */
  /* (stored in 'Capture.kind'; Cclose is 0 and marks a closing entry) */
  148. typedef enum CapKind {
  149. Cclose, Cposition, Cconst, Cbackref, Carg, Csimple, Ctable, Cfunction,
  150. Cquery, Cstring, Cnum, Csubst, Cfold, Cruntime, Cgroup
  151. } CapKind;
  /* One entry of the capture list produced by a match. */
  152. typedef struct Capture {
  153. const char *s; /* subject position */
  154. short idx; /* extra info about capture (group name, arg index, etc.) */
  155. byte kind; /* kind of capture */
  156. byte siz; /* size of full capture + 1 (0 = not a full capture) */
  157. } Capture;
  /* Cursor over a capture list plus the context needed to evaluate it. */
  158. typedef struct CapState {
  159. Capture *cap; /* current capture */
  160. Capture *ocap; /* (original) capture list */
  161. lua_State *L;
  162. int ptop; /* index of last argument to 'match' */
  163. const char *s; /* original string */
  164. int valuecached; /* value stored in cache slot */
  165. } CapState;
  /* entry points implemented in lpcap.c */
  166. int runtimecap (CapState *cs, Capture *close, const char *s, int *rem);
  167. int getcaptures (lua_State *L, const char *s, const char *r, int ptop);
  168. int finddyncap (Capture *cap, Capture *last);
  169. #endif
  170. /*
  171. ** $Id: lpvm.h,v 1.2 2013/04/03 20:37:18 roberto Exp $
  172. */
  173. #if !defined(lpvm_h)
  174. #define lpvm_h
  175. /* Virtual Machine's instructions */
  176. typedef enum Opcode {
  177. IAny, /* if no char, fail */
  178. IChar, /* if char != aux, fail */
  179. ISet, /* if char not in buff, fail */
  180. ITestAny, /* if no char, jump to 'offset' */
  181. ITestChar, /* if char != aux, jump to 'offset' */
  182. ITestSet, /* if char not in buff, jump to 'offset' */
  183. ISpan, /* read a span of chars in buff */
  184. IBehind, /* walk back 'aux' characters (fail if not possible) */
  185. IRet, /* return from a rule */
  186. IEnd, /* end of pattern */
  187. IChoice, /* stack a choice; next fail will jump to 'offset' */
  188. IJmp, /* jump to 'offset' */
  189. ICall, /* call rule at 'offset' */
  190. IOpenCall, /* call rule number 'key' (must be closed to a ICall) */
  191. ICommit, /* pop choice and jump to 'offset' */
  192. IPartialCommit, /* update top choice to current position and jump */
  193. IBackCommit, /* "fails" but jump to its own 'offset' */
  194. IFailTwice, /* pop one choice and then fail */
  195. IFail, /* go back to saved state on choice and jump to saved offset */
  196. IGiveup, /* internal use */
  197. IFullCapture, /* complete capture of last 'off' chars */
  198. IOpenCapture, /* start a capture */
  199. ICloseCapture,
  200. ICloseRunTime
  201. } Opcode;
  /*
  ** One cell of compiled code. A cell is read either as an instruction
  ** header ('i'), as a jump 'offset', or as raw bytes ('buff', the
  ** charset referred to by the opcode comments above).
  */
  202. typedef union Instruction {
  203. struct Inst {
  204. byte code; /* NOTE(review): presumably an Opcode value - confirm */
  205. byte aux; /* small operand; capture kind/offset are packed here */
  206. short key;
  207. } i;
  208. int offset;
  209. byte buff[1];
  210. } Instruction;
  /* functions declared for the matching machine; none is defined in
  ** the part of the file shown here */
  211. int getposition (lua_State *L, int t, int i);
  212. void printpatt (Instruction *p, int n);
  213. const char *match (lua_State *L, const char *o, const char *s, const char *e,
  214. Instruction *op, Capture *capture, int ptop);
  215. int verify (lua_State *L, Instruction *op, const Instruction *p,
  216. Instruction *e, int postable, int rule);
  217. void checkrule (lua_State *L, Instruction *op, int from, int to,
  218. int postable, int rule);
  219. #endif
  220. /*
  221. ** $Id: lpcode.h,v 1.5 2013/04/04 21:24:45 roberto Exp $
  222. */
  223. #if !defined(lpcode_h)
  224. #define lpcode_h
  225. #include "lua.h"
  /* interface of the pattern analysis/compilation code (lpcode.c) */
  226. int tocharset (TTree *tree, Charset *cs);
  227. int checkaux (TTree *tree, int pred);
  228. int fixedlenx (TTree *tree, int count, int len);
  229. int hascaptures (TTree *tree);
  230. int lp_gc (lua_State *L);
  231. Instruction *compile (lua_State *L, Pattern *p);
  232. void reallocprog (lua_State *L, Pattern *p, int nsize);
  233. int sizei (const Instruction *i);
  /* predicates accepted as the 'pred' argument of 'checkaux' */
  234. #define PEnullable 0
  235. #define PEnofail 1
  /* convenience wrappers that fix the extra arguments */
  236. #define nofail(t) checkaux(t, PEnofail)
  237. #define nullable(t) checkaux(t, PEnullable)
  238. #define fixedlen(t) fixedlenx(t, 0, 0)
  239. #endif
  240. /*
  241. ** $Id: lpprint.h,v 1.1 2013/03/21 20:25:12 roberto Exp $
  242. */
  243. #if !defined(lpprint_h)
  244. #define lpprint_h
  /*
  ** Debug printers. With LPEG_DEBUG they are real functions; otherwise
  ** three of the names become macros that raise a Lua error. Those
  ** macros ignore their arguments and use a variable named 'L', which
  ** must therefore be in scope at the point of use.
  */
  245. #if defined(LPEG_DEBUG)
  246. void printpatt (Instruction *p, int n);
  247. void printtree (TTree *tree, int ident);
  248. void printktable (lua_State *L, int idx);
  249. void printcharset (const byte *st);
  250. void printcaplist (Capture *cap, Capture *limit);
  251. #else
  252. #define printktable(L,idx) \
  253. luaL_error(L, "function only implemented in debug mode")
  254. #define printtree(tree,i) \
  255. luaL_error(L, "function only implemented in debug mode")
  256. #define printpatt(p,n) \
  257. luaL_error(L, "function only implemented in debug mode")
  258. #endif
  259. #endif
  260. /*
  261. ** $Id: lpcap.c,v 1.4 2013/03/21 20:25:12 roberto Exp $
  262. ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
  263. */
  264. #include "lua.h"
  265. #include "lauxlib.h"
  /* kind of a capture entry, and test for a closing entry */
  266. #define captype(cap) ((cap)->kind)
  267. #define isclosecap(cap) (captype(cap) == Cclose)
  /* subject address given by 's' plus the recorded size ('siz' - 1) */
  268. #define closeaddr(c) ((c)->s + (c)->siz - 1)
  /* a full capture has a non-zero 'siz' and needs no separate close entry */
  269. #define isfullcap(cap) ((cap)->siz != 0)
  /* push entry 'v' of the running pattern's ktable onto the Lua stack */
  270. #define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v)
  /* push the ktable entry selected by the current capture's 'idx' */
  271. #define pushluaval(cs) getfromktable(cs, (cs)->cap->idx)
  272. /*
  273. ** Put at the cache for Lua values the value indexed by 'v' in ktable
  274. ** of the running pattern (if it is not there yet); returns its index.
  275. */
  276. static int updatecache (CapState *cs, int v) {
  277. int idx = cs->ptop + 1; /* stack index of cache for Lua values */
  278. if (v != cs->valuecached) { /* not there? */
  279. getfromktable(cs, v); /* get value from 'ktable' */
  280. lua_replace(cs->L, idx); /* put it at reserved stack position */
  281. cs->valuecached = v; /* keep track of what is there */
  282. }
  283. return idx;
  284. }
  285. static int pushcapture (CapState *cs);
  286. /*
  287. ** Goes back in a list of captures looking for an open capture
  288. ** corresponding to a close
  289. */
  290. static Capture *findopen (Capture *cap) {
  291. int n = 0; /* number of closes waiting an open */
  292. for (;;) {
  293. cap--;
  294. if (isclosecap(cap)) n++; /* one more open to skip */
  295. else if (!isfullcap(cap))
  296. if (n-- == 0) return cap;
  297. }
  298. }
  299. /*
  300. ** Go to the next capture
  301. */
  302. static void nextcap (CapState *cs) {
  303. Capture *cap = cs->cap;
  304. if (!isfullcap(cap)) { /* not a single capture? */
  305. int n = 0; /* number of opens waiting a close */
  306. for (;;) { /* look for corresponding close */
  307. cap++;
  308. if (isclosecap(cap)) {
  309. if (n-- == 0) break;
  310. }
  311. else if (!isfullcap(cap)) n++;
  312. }
  313. }
  314. cs->cap = cap + 1; /* + 1 to skip last close (or entire single capture) */
  315. }
  316. /*
  317. ** Push on the Lua stack all values generated by nested captures inside
  318. ** the current capture. Returns number of values pushed. 'addextra'
  319. ** makes it push the entire match after all captured values. The
  320. ** entire match is pushed also if there are no other nested values,
  321. ** so the function never returns zero.
  322. */
  323. static int pushnestedvalues (CapState *cs, int addextra) {
  324. Capture *co = cs->cap;
  325. if (isfullcap(cs->cap++)) { /* no nested captures? */
  326. lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */
  327. return 1; /* that is it */
  328. }
  329. else {
  330. int n = 0;
  331. while (!isclosecap(cs->cap)) /* repeat for all nested patterns */
  332. n += pushcapture(cs);
  333. if (addextra || n == 0) { /* need extra? */
  334. lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */
  335. n++;
  336. }
  337. cs->cap++; /* skip close entry */
  338. return n;
  339. }
  340. }
  341. /*
  342. ** Push only the first value generated by nested captures
  343. */
  344. static void pushonenestedvalue (CapState *cs) {
  345. int n = pushnestedvalues(cs, 0);
  346. if (n > 1)
  347. lua_pop(cs->L, n - 1); /* pop extra values */
  348. }
  349. /*
  350. ** Try to find a named group capture with the name given at the top of
  351. ** the stack; goes backward from 'cap'.
  352. */
  353. static Capture *findback (CapState *cs, Capture *cap) {
  354. lua_State *L = cs->L;
  355. while (cap-- > cs->ocap) { /* repeat until end of list */
  356. if (isclosecap(cap))
  357. cap = findopen(cap); /* skip nested captures */
  358. else if (!isfullcap(cap))
  359. continue; /* opening an enclosing capture: skip and get previous */
  360. if (captype(cap) == Cgroup) {
  361. getfromktable(cs, cap->idx); /* get group name */
  362. if (lua_equal(L, -2, -1)) { /* right group? */
  363. lua_pop(L, 2); /* remove reference name and group name */
  364. return cap;
  365. }
  366. else lua_pop(L, 1); /* remove group name */
  367. }
  368. }
  369. luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1));
  370. return NULL; /* to avoid warnings */
  371. }
  372. /*
  373. ** Back-reference capture. Return number of values pushed.
  374. */
  375. static int backrefcap (CapState *cs) {
  376. int n;
  377. Capture *curr = cs->cap;
  378. pushluaval(cs); /* reference name */
  379. cs->cap = findback(cs, curr); /* find corresponding group */
  380. n = pushnestedvalues(cs, 0); /* push group's values */
  381. cs->cap = curr + 1;
  382. return n;
  383. }
  384. /*
  385. ** Table capture: creates a new table and populates it with nested
  386. ** captures.
  387. */
  388. static int tablecap (CapState *cs) {
  389. lua_State *L = cs->L;
  390. int n = 0;
  391. lua_newtable(L);
  392. if (isfullcap(cs->cap++))
  393. return 1; /* table is empty */
  394. while (!isclosecap(cs->cap)) {
  395. if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) { /* named group? */
  396. pushluaval(cs); /* push group name */
  397. pushonenestedvalue(cs);
  398. lua_settable(L, -3);
  399. }
  400. else { /* not a named group */
  401. int i;
  402. int k = pushcapture(cs);
  403. for (i = k; i > 0; i--) /* store all values into table */
  404. lua_rawseti(L, -(i + 1), n + i);
  405. n += k;
  406. }
  407. }
  408. cs->cap++; /* skip close entry */
  409. return 1; /* number of values pushed (only the table) */
  410. }
  411. /*
  412. ** Table-query capture
  413. */
  414. static int querycap (CapState *cs) {
  415. int idx = cs->cap->idx;
  416. pushonenestedvalue(cs); /* get nested capture */
  417. lua_gettable(cs->L, updatecache(cs, idx)); /* query cap. value at table */
  418. if (!lua_isnil(cs->L, -1))
  419. return 1;
  420. else { /* no value */
  421. lua_pop(cs->L, 1); /* remove nil */
  422. return 0;
  423. }
  424. }
  425. /*
  426. ** Fold capture
  427. */
  428. static int foldcap (CapState *cs) {
  429. int n;
  430. lua_State *L = cs->L;
  431. int idx = cs->cap->idx;
  432. if (isfullcap(cs->cap++) || /* no nested captures? */
  433. isclosecap(cs->cap) || /* no nested captures (large subject)? */
  434. (n = pushcapture(cs)) == 0) /* nested captures with no values? */
  435. return luaL_error(L, "no initial value for fold capture");
  436. if (n > 1)
  437. lua_pop(L, n - 1); /* leave only one result for accumulator */
  438. while (!isclosecap(cs->cap)) {
  439. lua_pushvalue(L, updatecache(cs, idx)); /* get folding function */
  440. lua_insert(L, -2); /* put it before accumulator */
  441. n = pushcapture(cs); /* get next capture's values */
  442. lua_call(L, n + 1, 1); /* call folding function */
  443. }
  444. cs->cap++; /* skip close entry */
  445. return 1; /* only accumulator left on the stack */
  446. }
  447. /*
  448. ** Function capture
  449. */
  450. static int functioncap (CapState *cs) {
  451. int n;
  452. int top = lua_gettop(cs->L);
  453. pushluaval(cs); /* push function */
  454. n = pushnestedvalues(cs, 0); /* push nested captures */
  455. lua_call(cs->L, n, LUA_MULTRET); /* call function */
  456. return lua_gettop(cs->L) - top; /* return function's results */
  457. }
  458. /*
  459. ** Select capture
  460. */
  461. static int numcap (CapState *cs) {
  462. int idx = cs->cap->idx; /* value to select */
  463. if (idx == 0) { /* no values? */
  464. nextcap(cs); /* skip entire capture */
  465. return 0; /* no value produced */
  466. }
  467. else {
  468. int n = pushnestedvalues(cs, 0);
  469. if (n < idx) /* invalid index? */
  470. return luaL_error(cs->L, "no capture '%d'", idx);
  471. else {
  472. lua_pushvalue(cs->L, -(n - idx + 1)); /* get selected capture */
  473. lua_replace(cs->L, -(n + 1)); /* put it in place of 1st capture */
  474. lua_pop(cs->L, n - 1); /* remove other captures */
  475. return 1;
  476. }
  477. }
  478. }
  479. /*
  480. ** Return the stack index of the first runtime capture in the given
  481. ** list of captures (or zero if no runtime captures)
  482. */
  483. int finddyncap (Capture *cap, Capture *last) {
  484. for (; cap < last; cap++) {
  485. if (cap->kind == Cruntime)
  486. return cap->idx; /* stack position of first capture */
  487. }
  488. return 0; /* no dynamic captures in this segment */
  489. }
  490. /*
  491. ** Calls a runtime capture. Returns number of captures removed by
  492. ** the call, including the initial Cgroup. (Captures to be added are
  493. ** on the Lua stack.)
  494. */
  495. int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) {
  496. int n, id;
  497. lua_State *L = cs->L;
  498. int otop = lua_gettop(L);
  499. Capture *open = findopen(close);
  500. assert(captype(open) == Cgroup);
  501. id = finddyncap(open, close); /* get first dynamic capture argument */
  502. close->kind = Cclose; /* closes the group */
  503. close->s = s;
  504. cs->cap = open; cs->valuecached = 0; /* prepare capture state */
  505. luaL_checkstack(L, 4, "too many runtime captures");
  506. pushluaval(cs); /* push function to be called */
  507. lua_pushvalue(L, SUBJIDX); /* push original subject */
  508. lua_pushinteger(L, s - cs->s + 1); /* push current position */
  509. n = pushnestedvalues(cs, 0); /* push nested captures */
  510. lua_call(L, n + 2, LUA_MULTRET); /* call dynamic function */
  511. if (id > 0) { /* are there old dynamic captures to be removed? */
  512. int i;
  513. for (i = id; i <= otop; i++)
  514. lua_remove(L, id); /* remove old dynamic captures */
  515. *rem = otop - id + 1; /* total number of dynamic captures removed */
  516. }
  517. else
  518. *rem = 0; /* no dynamic captures removed */
  519. return close - open; /* number of captures of all kinds removed */
  520. }
  521. /*
  522. ** Auxiliary structure for substitution and string captures: keep
  523. ** information about nested captures for future use, avoiding to push
  524. ** string results into Lua
  525. */
  526. typedef struct StrAux {
  527. int isstring; /* whether capture is a string */
  528. union {
  529. Capture *cp; /* if not a string, respective capture */
  530. struct { /* if it is a string... */
  531. const char *s; /* ... starts here */
  532. const char *e; /* ... ends here */
  533. } s;
  534. } u;
  535. } StrAux;
  /* capacity of the StrAux array filled by 'getstrcaps'; slot 0 holds the
  ** whole match, and format indices are single digits (0-9) */
  536. #define MAXSTRCAPS 10
  537. /*
  538. ** Collect values from current capture into array 'cps'. Current
  539. ** capture must be Cstring (first call) or Csimple (recursive calls).
  540. ** (In first call, fills %0 with whole match for Cstring.)
  541. ** Returns number of elements in the array that were filled.
  542. */
  543. static int getstrcaps (CapState *cs, StrAux *cps, int n) {
  544. int k = n++;
  545. cps[k].isstring = 1; /* get string value */
  546. cps[k].u.s.s = cs->cap->s; /* starts here */
  547. if (!isfullcap(cs->cap++)) { /* nested captures? */
  548. while (!isclosecap(cs->cap)) { /* traverse them */
  549. if (n >= MAXSTRCAPS) /* too many captures? */
  550. nextcap(cs); /* skip extra captures (will not need them) */
  551. else if (captype(cs->cap) == Csimple) /* string? */
  552. n = getstrcaps(cs, cps, n); /* put info. into array */
  553. else {
  554. cps[n].isstring = 0; /* not a string */
  555. cps[n].u.cp = cs->cap; /* keep original capture */
  556. nextcap(cs);
  557. n++;
  558. }
  559. }
  560. cs->cap++; /* skip close */
  561. }
  562. cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */
  563. return n;
  564. }
  565. /*
  566. ** add next capture value (which should be a string) to buffer 'b'
  567. */
  568. static int addonestring (luaL_Buffer *b, CapState *cs, const char *what);
  569. /*
  570. ** String capture: add result to buffer 'b' (instead of pushing
  571. ** it into the stack)
  572. */
  573. static void stringcap (luaL_Buffer *b, CapState *cs) {
  574. StrAux cps[MAXSTRCAPS];
  575. int n;
  576. size_t len, i;
  577. const char *fmt; /* format string */
  578. fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len);
  579. n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures */
  580. for (i = 0; i < len; i++) { /* traverse them */
  581. if (fmt[i] != '%') /* not an escape? */
  582. luaL_addchar(b, fmt[i]); /* add it to buffer */
  583. else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? */
  584. luaL_addchar(b, fmt[i]); /* add to buffer */
  585. else {
  586. int l = fmt[i] - '0'; /* capture index */
  587. if (l > n)
  588. luaL_error(cs->L, "invalid capture index (%d)", l);
  589. else if (cps[l].isstring)
  590. luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s);
  591. else {
  592. Capture *curr = cs->cap;
  593. cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */
  594. if (!addonestring(b, cs, "capture"))
  595. luaL_error(cs->L, "no values in capture index %d", l);
  596. cs->cap = curr; /* continue from where it stopped */
  597. }
  598. }
  599. }
  600. }
  601. /*
  602. ** Substitution capture: add result to buffer 'b'
  603. */
  604. static void substcap (luaL_Buffer *b, CapState *cs) {
  605. const char *curr = cs->cap->s;
  606. if (isfullcap(cs->cap)) /* no nested captures? */
  607. luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */
  608. else {
  609. cs->cap++; /* skip open entry */
  610. while (!isclosecap(cs->cap)) { /* traverse nested captures */
  611. const char *next = cs->cap->s;
  612. luaL_addlstring(b, curr, next - curr); /* add text up to capture */
  613. if (addonestring(b, cs, "replacement"))
  614. curr = closeaddr(cs->cap - 1); /* continue after match */
  615. else /* no capture value */
  616. curr = next; /* keep original text in final result */
  617. }
  618. luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */
  619. }
  620. cs->cap++; /* go to next capture */
  621. }
  622. /*
  623. ** Evaluates a capture and adds its first value to buffer 'b'; returns
  624. ** whether there was a value
  625. */
  626. static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) {
  627. switch (captype(cs->cap)) {
  628. case Cstring:
  629. stringcap(b, cs); /* add capture directly to buffer */
  630. return 1;
  631. case Csubst:
  632. substcap(b, cs); /* add capture directly to buffer */
  633. return 1;
  634. default: {
  635. lua_State *L = cs->L;
  636. int n = pushcapture(cs);
  637. if (n > 0) {
  638. if (n > 1) lua_pop(L, n - 1); /* only one result */
  639. if (!lua_isstring(L, -1))
  640. luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1));
  641. luaL_addvalue(b);
  642. }
  643. return n;
  644. }
  645. }
  646. }
  647. /*
  648. ** Push all values of the current capture into the stack; returns
  649. ** number of values pushed
  650. */
  651. static int pushcapture (CapState *cs) {
  652. lua_State *L = cs->L;
  653. luaL_checkstack(L, 4, "too many captures");
  654. switch (captype(cs->cap)) {
  655. case Cposition: {
  656. lua_pushinteger(L, cs->cap->s - cs->s + 1);
  657. cs->cap++;
  658. return 1;
  659. }
  660. case Cconst: {
  661. pushluaval(cs);
  662. cs->cap++;
  663. return 1;
  664. }
  665. case Carg: {
  666. int arg = (cs->cap++)->idx;
  667. if (arg + FIXEDARGS > cs->ptop)
  668. return luaL_error(L, "reference to absent argument #%d", arg);
  669. lua_pushvalue(L, arg + FIXEDARGS);
  670. return 1;
  671. }
  672. case Csimple: {
  673. int k = pushnestedvalues(cs, 1);
  674. lua_insert(L, -k); /* make whole match be first result */
  675. return k;
  676. }
  677. case Cruntime: {
  678. lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */
  679. return 1;
  680. }
  681. case Cstring: {
  682. luaL_Buffer b;
  683. luaL_buffinit(L, &b);
  684. stringcap(&b, cs);
  685. luaL_pushresult(&b);
  686. return 1;
  687. }
  688. case Csubst: {
  689. luaL_Buffer b;
  690. luaL_buffinit(L, &b);
  691. substcap(&b, cs);
  692. luaL_pushresult(&b);
  693. return 1;
  694. }
  695. case Cgroup: {
  696. if (cs->cap->idx == 0) /* anonymous group? */
  697. return pushnestedvalues(cs, 0); /* add all nested values */
  698. else { /* named group: add no values */
  699. nextcap(cs); /* skip capture */
  700. return 0;
  701. }
  702. }
  703. case Cbackref: return backrefcap(cs);
  704. case Ctable: return tablecap(cs);
  705. case Cfunction: return functioncap(cs);
  706. case Cnum: return numcap(cs);
  707. case Cquery: return querycap(cs);
  708. case Cfold: return foldcap(cs);
  709. default: assert(0); return 0;
  710. }
  711. }
  712. /*
  713. ** Prepare a CapState structure and traverse the entire list of
  714. ** captures in the stack pushing its results. 's' is the subject
  715. ** string, 'r' is the final position of the match, and 'ptop'
  716. ** the index in the stack where some useful values were pushed.
  717. ** Returns the number of results pushed. (If the list produces no
  718. ** results, push the final position of the match.)
  719. */
  720. int getcaptures (lua_State *L, const char *s, const char *r, int ptop) {
  721. Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop));
  722. int n = 0;
  723. if (!isclosecap(capture)) { /* is there any capture? */
  724. CapState cs;
  725. cs.ocap = cs.cap = capture; cs.L = L;
  726. cs.s = s; cs.valuecached = 0; cs.ptop = ptop;
  727. do { /* collect their values */
  728. n += pushcapture(&cs);
  729. } while (!isclosecap(cs.cap));
  730. }
  731. if (n == 0) { /* no capture values? */
  732. lua_pushinteger(L, r - s + 1); /* return only end position */
  733. n = 1;
  734. }
  735. return n;
  736. }
  737. /*
  738. ** $Id: lpcode.c,v 1.18 2013/04/12 16:30:33 roberto Exp $
  739. ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
  740. */
  741. #include <limits.h>
  742. #include "lua.h"
  743. #include "lauxlib.h"
  744. /* signals a "no-instruction" */
  745. #define NOINST -1
  /* a charset with every bit set (32 bytes of 0xFF), and a pointer to it */
  746. static const Charset fullset_ =
  747. {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  748. 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  749. 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  750. 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}};
  751. static const Charset *fullset = &fullset_;
  752. /*
  753. ** {======================================================
  754. ** Analysis and some optimizations
  755. ** =======================================================
  756. */
  757. /*
  758. ** Check whether a charset is empty (IFail), singleton (IChar),
  759. ** full (IAny), or none of those (ISet).
  760. */
  761. static Opcode charsettype (const byte *cs, int *c) {
  762. int count = 0;
  763. int i;
  764. int candidate = -1; /* candidate position for a char */
  765. for (i = 0; i < CHARSETSIZE; i++) {
  766. int b = cs[i];
  767. if (b == 0) {
  768. if (count > 1) return ISet; /* else set is still empty */
  769. }
  770. else if (b == 0xFF) {
  771. if (count < (i * BITSPERCHAR))
  772. return ISet;
  773. else count += BITSPERCHAR; /* set is still full */
  774. }
  775. else if ((b & (b - 1)) == 0) { /* byte has only one bit? */
  776. if (count > 0)
  777. return ISet; /* set is neither full nor empty */
  778. else { /* set has only one char till now; track it */
  779. count++;
  780. candidate = i;
  781. }
  782. }
  783. else return ISet; /* byte is neither empty, full, nor singleton */
  784. }
  785. switch (count) {
  786. case 0: return IFail; /* empty set */
  787. case 1: { /* singleton; find character bit inside byte */
  788. int b = cs[candidate];
  789. *c = candidate * BITSPERCHAR;
  790. if ((b & 0xF0) != 0) { *c += 4; b >>= 4; }
  791. if ((b & 0x0C) != 0) { *c += 2; b >>= 2; }
  792. if ((b & 0x02) != 0) { *c += 1; }
  793. return IChar;
  794. }
  795. default: {
  796. assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */
  797. return IAny;
  798. }
  799. }
  800. }
  801. /*
  802. ** A few basic operations on Charsets
  803. */
  804. static void cs_complement (Charset *cs) {
  805. loopset(i, cs->cs[i] = ~cs->cs[i]);
  806. }
  807. static int cs_equal (const byte *cs1, const byte *cs2) {
  808. loopset(i, if (cs1[i] != cs2[i]) return 0);
  809. return 1;
  810. }
  811. /*
  812. ** computes whether sets cs1 and cs2 are disjoint
  813. */
  814. static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
  815. loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;)
  816. return 1;
  817. }
  818. /*
  819. ** Convert a 'char' pattern (TSet, TChar, TAny) to a charset
  820. */
  821. int tocharset (TTree *tree, Charset *cs) {
  822. switch (tree->tag) {
  823. case TSet: { /* copy set */
  824. loopset(i, cs->cs[i] = treebuffer(tree)[i]);
  825. return 1;
  826. }
  827. case TChar: { /* only one char */
  828. assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
  829. loopset(i, cs->cs[i] = 0); /* erase all chars */
  830. setchar(cs->cs, tree->u.n); /* add that one */
  831. return 1;
  832. }
  833. case TAny: {
  834. loopset(i, cs->cs[i] = 0xFF); /* add all to the set */
  835. return 1;
  836. }
  837. default: return 0;
  838. }
  839. }
  840. /*
  841. ** Checks whether a pattern has captures
  842. */
  843. int hascaptures (TTree *tree) {
  844. tailcall:
  845. switch (tree->tag) {
  846. case TCapture: case TRunTime:
  847. return 1;
  848. default: {
  849. switch (numsiblings[tree->tag]) {
  850. case 1: /* return hascaptures(sib1(tree)); */
  851. tree = sib1(tree); goto tailcall;
  852. case 2:
  853. if (hascaptures(sib1(tree))) return 1;
  854. /* else return hascaptures(sib2(tree)); */
  855. tree = sib2(tree); goto tailcall;
  856. default: assert(numsiblings[tree->tag] == 0); return 0;
  857. }
  858. }
  859. }
  860. }
  861. /*
  862. ** Checks how a pattern behaves regarding the empty string,
  863. ** in one of two different ways:
  864. ** A pattern is *nullable* if it can match without consuming any character;
  865. ** A pattern is *nofail* if it never fails for any string
  866. ** (including the empty string).
  867. ** The difference is only for predicates and run-time captures;
  868. ** for other patterns, the two properties are equivalent.
  869. ** (With predicates, &'a' is nullable but not nofail. Of course,
  870. ** nofail => nullable.)
  871. ** These functions are all convervative in the following way:
  872. ** p is nullable => nullable(p)
  873. ** nofail(p) => p cannot fail
  874. ** The function assumes that TOpenCall is not nullable;
  875. ** this will be checked again when the grammar is fixed.)
  876. ** Run-time captures can do whatever they want, so the result
  877. ** is conservative.
  878. */
  879. int checkaux (TTree *tree, int pred) {
  880. tailcall:
  881. switch (tree->tag) {
  882. case TChar: case TSet: case TAny:
  883. case TFalse: case TOpenCall:
  884. return 0; /* not nullable */
  885. case TRep: case TTrue:
  886. return 1; /* no fail */
  887. case TNot: case TBehind: /* can match empty, but can fail */
  888. if (pred == PEnofail) return 0;
  889. else return 1; /* PEnullable */
  890. case TAnd: /* can match empty; fail iff body does */
  891. if (pred == PEnullable) return 1;
  892. /* else return checkaux(sib1(tree), pred); */
  893. tree = sib1(tree); goto tailcall;
  894. case TRunTime: /* can fail; match empty iff body does */
  895. if (pred == PEnofail) return 0;
  896. /* else return checkaux(sib1(tree), pred); */
  897. tree = sib1(tree); goto tailcall;
  898. case TSeq:
  899. if (!checkaux(sib1(tree), pred)) return 0;
  900. /* else return checkaux(sib2(tree), pred); */
  901. tree = sib2(tree); goto tailcall;
  902. case TChoice:
  903. if (checkaux(sib2(tree), pred)) return 1;
  904. /* else return checkaux(sib1(tree), pred); */
  905. tree = sib1(tree); goto tailcall;
  906. case TCapture: case TGrammar: case TRule:
  907. /* return checkaux(sib1(tree), pred); */
  908. tree = sib1(tree); goto tailcall;
  909. case TCall: /* return checkaux(sib2(tree), pred); */
  910. tree = sib2(tree); goto tailcall;
  911. default: assert(0); return 0;
  912. };
  913. }
  914. /*
  915. ** number of characters to match a pattern (or -1 if variable)
  916. ** ('count' avoids infinite loops for grammars)
  917. */
  918. int fixedlenx (TTree *tree, int count, int len) {
  919. tailcall:
  920. switch (tree->tag) {
  921. case TChar: case TSet: case TAny:
  922. return len + 1;
  923. case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
  924. return len;
  925. case TRep: case TRunTime: case TOpenCall:
  926. return -1;
  927. case TCapture: case TRule: case TGrammar:
  928. /* return fixedlenx(sib1(tree), count); */
  929. tree = sib1(tree); goto tailcall;
  930. case TCall:
  931. if (count++ >= MAXRULES)
  932. return -1; /* may be a loop */
  933. /* else return fixedlenx(sib2(tree), count); */
  934. tree = sib2(tree); goto tailcall;
  935. case TSeq: {
  936. len = fixedlenx(sib1(tree), count, len);
  937. if (len < 0) return -1;
  938. /* else return fixedlenx(sib2(tree), count, len); */
  939. tree = sib2(tree); goto tailcall;
  940. }
  941. case TChoice: {
  942. int n1, n2;
  943. n1 = fixedlenx(sib1(tree), count, len);
  944. if (n1 < 0) return -1;
  945. n2 = fixedlenx(sib2(tree), count, len);
  946. if (n1 == n2) return n1;
  947. else return -1;
  948. }
  949. default: assert(0); return 0;
  950. };
  951. }
  952. /*
  953. ** Computes the 'first set' of a pattern.
  954. ** The result is a conservative aproximation:
  955. ** match p ax -> x' for some x ==> a in first(p).
  956. ** The set 'follow' is the first set of what follows the
  957. ** pattern (full set if nothing follows it).
  958. ** The function returns 0 when this set can be used for
  959. ** tests that avoid the pattern altogether.
  960. ** A non-zero return can happen for two reasons:
  961. ** 1) match p '' -> '' ==> returns 1.
  962. ** (tests cannot be used because they always fail for an empty input)
  963. ** 2) there is a match-time capture ==> returns 2.
  964. ** (match-time captures should not be avoided by optimizations)
  965. */
  966. static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
  967. tailcall:
  968. switch (tree->tag) {
  969. case TChar: case TSet: case TAny: {
  970. tocharset(tree, firstset);
  971. return 0;
  972. }
  973. case TTrue: {
  974. loopset(i, firstset->cs[i] = follow->cs[i]);
  975. return 1;
  976. }
  977. case TFalse: {
  978. loopset(i, firstset->cs[i] = 0);
  979. return 0;
  980. }
  981. case TChoice: {
  982. Charset csaux;
  983. int e1 = getfirst(sib1(tree), follow, firstset);
  984. int e2 = getfirst(sib2(tree), follow, &csaux);
  985. loopset(i, firstset->cs[i] |= csaux.cs[i]);
  986. return e1 | e2;
  987. }
  988. case TSeq: {
  989. if (!nullable(sib1(tree))) {
  990. /* return getfirst(sib1(tree), fullset, firstset); */
  991. tree = sib1(tree); follow = fullset; goto tailcall;
  992. }
  993. else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */
  994. Charset csaux;
  995. int e2 = getfirst(sib2(tree), follow, &csaux);
  996. int e1 = getfirst(sib1(tree), &csaux, firstset);
  997. if (e1 == 0) return 0; /* 'e1' ensures that first can be used */
  998. else if ((e1 | e2) & 2) /* one of the children has a matchtime? */
  999. return 2; /* pattern has a matchtime capture */
  1000. else return e2; /* else depends on 'e2' */
  1001. }
  1002. }
  1003. case TRep: {
  1004. getfirst(sib1(tree), follow, firstset);
  1005. loopset(i, firstset->cs[i] |= follow->cs[i]);
  1006. return 1; /* accept the empty string */
  1007. }
  1008. case TCapture: case TGrammar: case TRule: {
  1009. /* return getfirst(sib1(tree), follow, firstset); */
  1010. tree = sib1(tree); goto tailcall;
  1011. }
  1012. case TRunTime: { /* function invalidates any follow info. */
  1013. int e = getfirst(sib1(tree), fullset, firstset);
  1014. if (e) return 2; /* function is not "protected"? */
  1015. else return 0; /* pattern inside capture ensures first can be used */
  1016. }
  1017. case TCall: {
  1018. /* return getfirst(sib2(tree), follow, firstset); */
  1019. tree = sib2(tree); goto tailcall;
  1020. }
  1021. case TAnd: {
  1022. int e = getfirst(sib1(tree), follow, firstset);
  1023. loopset(i, firstset->cs[i] &= follow->cs[i]);
  1024. return e;
  1025. }
  1026. case TNot: {
  1027. if (tocharset(sib1(tree), firstset)) {
  1028. cs_complement(firstset);
  1029. return 1;
  1030. }
  1031. /* else go through */
  1032. }
  1033. case TBehind: { /* instruction gives no new information */
  1034. /* call 'getfirst' to check for math-time captures */
  1035. int e = getfirst(sib1(tree), follow, firstset);
  1036. loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */
  1037. return e | 1; /* always can accept the empty string */
  1038. }
  1039. default: assert(0); return 0;
  1040. }
  1041. }
  1042. /*
  1043. ** If it returns true, then pattern can fail only depending on the next
  1044. ** character of the subject
  1045. */
  1046. static int headfail (TTree *tree) {
  1047. tailcall:
  1048. switch (tree->tag) {
  1049. case TChar: case TSet: case TAny: case TFalse:
  1050. return 1;
  1051. case TTrue: case TRep: case TRunTime: case TNot:
  1052. case TBehind:
  1053. return 0;
  1054. case TCapture: case TGrammar: case TRule: case TAnd:
  1055. tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
  1056. case TCall:
  1057. tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */
  1058. case TSeq:
  1059. if (!nofail(sib2(tree))) return 0;
  1060. /* else return headfail(sib1(tree)); */
  1061. tree = sib1(tree); goto tailcall;
  1062. case TChoice:
  1063. if (!headfail(sib1(tree))) return 0;
  1064. /* else return headfail(sib2(tree)); */
  1065. tree = sib2(tree); goto tailcall;
  1066. default: assert(0); return 0;
  1067. }
  1068. }
  1069. /*
  1070. ** Check whether the code generation for the given tree can benefit
  1071. ** from a follow set (to avoid computing the follow set when it is
  1072. ** not needed)
  1073. */
  1074. static int needfollow (TTree *tree) {
  1075. tailcall:
  1076. switch (tree->tag) {
  1077. case TChar: case TSet: case TAny:
  1078. case TFalse: case TTrue: case TAnd: case TNot:
  1079. case TRunTime: case TGrammar: case TCall: case TBehind:
  1080. return 0;
  1081. case TChoice: case TRep:
  1082. return 1;
  1083. case TCapture:
  1084. tree = sib1(tree); goto tailcall;
  1085. case TSeq:
  1086. tree = sib2(tree); goto tailcall;
  1087. default: assert(0); return 0;
  1088. }
  1089. }
  1090. /* }====================================================== */
  1091. /*
  1092. ** {======================================================
  1093. ** Code generation
  1094. ** =======================================================
  1095. */
  1096. /*
  1097. ** size of an instruction
  1098. */
  1099. int sizei (const Instruction *i) {
  1100. switch((Opcode)i->i.code) {
  1101. case ISet: case ISpan: return CHARSETINSTSIZE;
  1102. case ITestSet: return CHARSETINSTSIZE + 1;
  1103. case ITestChar: case ITestAny: case IChoice: case IJmp:
  1104. case ICall: case IOpenCall: case ICommit: case IPartialCommit:
  1105. case IBackCommit: return 2;
  1106. default: return 1;
  1107. }
  1108. }
  1109. /*
  1110. ** state for the compiler
  1111. */
  1112. typedef struct CompileState {
  1113. Pattern *p; /* pattern being compiled */
  1114. int ncode; /* next position in p->code to be filled */
  1115. lua_State *L;
  1116. } CompileState;
  1117. /*
  1118. ** code generation is recursive; 'opt' indicates that the code is
  1119. ** being generated under a 'IChoice' operator jumping to its end.
  1120. ** 'tt' points to a previous test protecting this code. 'fl' is
  1121. ** the follow set of the pattern.
  1122. */
  1123. static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
  1124. const Charset *fl);
  1125. void reallocprog (lua_State *L, Pattern *p, int nsize) {
  1126. void *ud;
  1127. lua_Alloc f = lua_getallocf(L, &ud);
  1128. void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction),
  1129. nsize * sizeof(Instruction));
  1130. if (newblock == NULL && nsize > 0)
  1131. luaL_error(L, "not enough memory");
  1132. p->code = (Instruction *)newblock;
  1133. p->codesize = nsize;
  1134. }
  1135. static int nextinstruction (CompileState *compst) {
  1136. int size = compst->p->codesize;
  1137. if (compst->ncode >= size)
  1138. reallocprog(compst->L, compst->p, size * 2);
  1139. return compst->ncode++;
  1140. }
  /* instruction at position 'idx' of the pattern being compiled (an lvalue) */
  #define getinstr(compst,idx)  ((compst)->p->code[(idx)])
  1142. static int addinstruction (CompileState *compst, Opcode op, int aux) {
  1143. int i = nextinstruction(compst);
  1144. getinstr(compst, i).i.code = op;
  1145. getinstr(compst, i).i.aux = aux;
  1146. return i;
  1147. }
  1148. static int addoffsetinst (CompileState *compst, Opcode op) {
  1149. int i = addinstruction(compst, op, 0); /* instruction */
  1150. addinstruction(compst, (Opcode)0, 0); /* open space for offset */
  1151. assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2);
  1152. return i;
  1153. }
  1154. static void setoffset (CompileState *compst, int instruction, int offset) {
  1155. getinstr(compst, instruction + 1).offset = offset;
  1156. }
  1157. /*
  1158. ** Add a capture instruction:
  1159. ** 'op' is the capture instruction; 'cap' the capture kind;
  1160. ** 'key' the key into ktable; 'aux' is optional offset
  1161. **
  1162. */
  1163. static int addinstcap (CompileState *compst, Opcode op, int cap, int key,
  1164. int aux) {
  1165. int i = addinstruction(compst, op, joinkindoff(cap, aux));
  1166. getinstr(compst, i).i.key = key;
  1167. return i;
  1168. }
  /* position where the next instruction will be stored */
  #define gethere(compst) 	((compst)->ncode)

  /*
  ** Absolute position targeted by the offset instruction at position
  ** 'i' of array 'code' (the offset, stored in the following slot, is
  ** relative to 'i'). Both arguments are fully parenthesized so that
  ** expressions such as 'x & 1' or 'base + n' expand correctly.
  */
  #define target(code,i)		((i) + (code)[(i) + 1].offset)
  1171. static void jumptothere (CompileState *compst, int instruction, int target) {
  1172. if (instruction >= 0)
  1173. setoffset(compst, instruction, target - instruction);
  1174. }
  1175. static void jumptohere (CompileState *compst, int instruction) {
  1176. jumptothere(compst, instruction, gethere(compst));
  1177. }
  1178. /*
  1179. ** Code an IChar instruction, or IAny if there is an equivalent
  1180. ** test dominating it
  1181. */
  1182. static void codechar (CompileState *compst, int c, int tt) {
  1183. if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar &&
  1184. getinstr(compst, tt).i.aux == c)
  1185. addinstruction(compst, IAny, 0);
  1186. else
  1187. addinstruction(compst, IChar, c);
  1188. }
  1189. /*
  1190. ** Add a charset posfix to an instruction
  1191. */
  1192. static void addcharset (CompileState *compst, const byte *cs) {
  1193. int p = gethere(compst);
  1194. int i;
  1195. for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
  1196. nextinstruction(compst); /* space for buffer */
  1197. /* fill buffer with charset */
  1198. loopset(j, getinstr(compst, p).buff[j] = cs[j]);
  1199. }
  1200. /*
  1201. ** code a char set, optimizing unit sets for IChar, "complete"
  1202. ** sets for IAny, and empty sets for IFail; also use an IAny
  1203. ** when instruction is dominated by an equivalent test.
  1204. */
  1205. static void codecharset (CompileState *compst, const byte *cs, int tt) {
  1206. int c = 0; /* (=) to avoid warnings */
  1207. Opcode op = charsettype(cs, &c);
  1208. switch (op) {
  1209. case IChar: codechar(compst, c, tt); break;
  1210. case ISet: { /* non-trivial set? */
  1211. if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet &&
  1212. cs_equal(cs, getinstr(compst, tt + 2).buff))
  1213. addinstruction(compst, IAny, 0);
  1214. else {
  1215. addinstruction(compst, ISet, 0);
  1216. addcharset(compst, cs);
  1217. }
  1218. break;
  1219. }
  1220. default: addinstruction(compst, op, c); break;
  1221. }
  1222. }
  1223. /*
  1224. ** code a test set, optimizing unit sets for ITestChar, "complete"
  1225. ** sets for ITestAny, and empty sets for IJmp (always fails).
  1226. ** 'e' is true iff test should accept the empty string. (Test
  1227. ** instructions in the current VM never accept the empty string.)
  1228. */
  1229. static int codetestset (CompileState *compst, Charset *cs, int e) {
  1230. if (e) return NOINST; /* no test */
  1231. else {
  1232. int c = 0;
  1233. Opcode op = charsettype(cs->cs, &c);
  1234. switch (op) {
  1235. case IFail: return addoffsetinst(compst, IJmp); /* always jump */
  1236. case IAny: return addoffsetinst(compst, ITestAny);
  1237. case IChar: {
  1238. int i = addoffsetinst(compst, ITestChar);
  1239. getinstr(compst, i).i.aux = c;
  1240. return i;
  1241. }
  1242. case ISet: {
  1243. int i = addoffsetinst(compst, ITestSet);
  1244. addcharset(compst, cs->cs);
  1245. return i;
  1246. }
  1247. default: assert(0); return 0;
  1248. }
  1249. }
  1250. }
  1251. /*
  1252. ** Find the final destination of a sequence of jumps
  1253. */
  1254. static int finaltarget (Instruction *code, int i) {
  1255. while (code[i].i.code == IJmp)
  1256. i = target(code, i);
  1257. return i;
  1258. }
  1259. /*
  1260. ** final label (after traversing any jumps)
  1261. */
  1262. static int finallabel (Instruction *code, int i) {
  1263. return finaltarget(code, target(code, i));
  1264. }
  1265. /*
  1266. ** <behind(p)> == behind n; <p> (where n = fixedlen(p))
  1267. */
  1268. static void codebehind (CompileState *compst, TTree *tree) {
  1269. if (tree->u.n > 0)
  1270. addinstruction(compst, IBehind, tree->u.n);
  1271. codegen(compst, sib1(tree), 0, NOINST, fullset);
  1272. }
  1273. /*
  1274. ** Choice; optimizations:
  1275. ** - when p1 is headfail
  1276. ** - when first(p1) and first(p2) are disjoint; than
  1277. ** a character not in first(p1) cannot go to p1, and a character
  1278. ** in first(p1) cannot go to p2 (at it is not in first(p2)).
  1279. ** (The optimization is not valid if p1 accepts the empty string,
  1280. ** as then there is no character at all...)
  1281. ** - when p2 is empty and opt is true; a IPartialCommit can resuse
  1282. ** the Choice already active in the stack.
  1283. */
  1284. static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
  1285. const Charset *fl) {
  1286. int emptyp2 = (p2->tag == TTrue);
  1287. Charset cs1, cs2;
  1288. int e1 = getfirst(p1, fullset, &cs1);
  1289. if (headfail(p1) ||
  1290. (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) {
  1291. /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */
  1292. int test = codetestset(compst, &cs1, 0);
  1293. int jmp = NOINST;
  1294. codegen(compst, p1, 0, test, fl);
  1295. if (!emptyp2)
  1296. jmp = addoffsetinst(compst, IJmp);
  1297. jumptohere(compst, test);
  1298. codegen(compst, p2, opt, NOINST, fl);
  1299. jumptohere(compst, jmp);
  1300. }
  1301. else if (opt && emptyp2) {
  1302. /* p1? == IPartialCommit; p1 */
  1303. jumptohere(compst, addoffsetinst(compst, IPartialCommit));
  1304. codegen(compst, p1, 1, NOINST, fullset);
  1305. }
  1306. else {
  1307. /* <p1 / p2> ==
  1308. test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
  1309. int pcommit;
  1310. int test = codetestset(compst, &cs1, e1);
  1311. int pchoice = addoffsetinst(compst, IChoice);
  1312. codegen(compst, p1, emptyp2, test, fullset);
  1313. pcommit = addoffsetinst(compst, ICommit);
  1314. jumptohere(compst, pchoice);
  1315. jumptohere(compst, test);
  1316. codegen(compst, p2, opt, NOINST, fl);
  1317. jumptohere(compst, pcommit);
  1318. }
  1319. }
  1320. /*
  1321. ** And predicate
  1322. ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
  1323. ** (valid only when 'p' has no captures)
  1324. */
  1325. static void codeand (CompileState *compst, TTree *tree, int tt) {
  1326. int n = fixedlen(tree);
  1327. if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) {
  1328. codegen(compst, tree, 0, tt, fullset);
  1329. if (n > 0)
  1330. addinstruction(compst, IBehind, n);
  1331. }
  1332. else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */
  1333. int pcommit;
  1334. int pchoice = addoffsetinst(compst, IChoice);
  1335. codegen(compst, tree, 0, tt, fullset);
  1336. pcommit = addoffsetinst(compst, IBackCommit);
  1337. jumptohere(compst, pchoice);
  1338. addinstruction(compst, IFail, 0);
  1339. jumptohere(compst, pcommit);
  1340. }
  1341. }
  1342. /*
  1343. ** Captures: if pattern has fixed (and not too big) length, use
  1344. ** a single IFullCapture instruction after the match; otherwise,
  1345. ** enclose the pattern with OpenCapture - CloseCapture.
  1346. */
  1347. static void codecapture (CompileState *compst, TTree *tree, int tt,
  1348. const Charset *fl) {
  1349. int len = fixedlen(sib1(tree));
  1350. if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) {
  1351. codegen(compst, sib1(tree), 0, tt, fl);
  1352. addinstcap(compst, IFullCapture, tree->cap, tree->key, len);
  1353. }
  1354. else {
  1355. addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0);
  1356. codegen(compst, sib1(tree), 0, tt, fl);
  1357. addinstcap(compst, ICloseCapture, Cclose, 0, 0);
  1358. }
  1359. }
  1360. static void coderuntime (CompileState *compst, TTree *tree, int tt) {
  1361. addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0);
  1362. codegen(compst, sib1(tree), 0, tt, fullset);
  1363. addinstcap(compst, ICloseRunTime, Cclose, 0, 0);
  1364. }
  1365. /*
  1366. ** Repetion; optimizations:
  1367. ** When pattern is a charset, can use special instruction ISpan.
  1368. ** When pattern is head fail, or if it starts with characters that
  1369. ** are disjoint from what follows the repetions, a simple test
  1370. ** is enough (a fail inside the repetition would backtrack to fail
  1371. ** again in the following pattern, so there is no need for a choice).
  1372. ** When 'opt' is true, the repetion can reuse the Choice already
  1373. ** active in the stack.
  1374. */
  1375. static void coderep (CompileState *compst, TTree *tree, int opt,
  1376. const Charset *fl) {
  1377. Charset st;
  1378. if (tocharset(tree, &st)) {
  1379. addinstruction(compst, ISpan, 0);
  1380. addcharset(compst, st.cs);
  1381. }
  1382. else {
  1383. int e1 = getfirst(tree, fullset, &st);
  1384. if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
  1385. /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
  1386. int jmp;
  1387. int test = codetestset(compst, &st, 0);
  1388. codegen(compst, tree, opt, test, fullset);
  1389. jmp = addoffsetinst(compst, IJmp);
  1390. jumptohere(compst, test);
  1391. jumptothere(compst, jmp, test);
  1392. }
  1393. else {
  1394. /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */
  1395. /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */
  1396. int commit, l2;
  1397. int test = codetestset(compst, &st, e1);
  1398. int pchoice = NOINST;
  1399. if (opt)
  1400. jumptohere(compst, addoffsetinst(compst, IPartialCommit));
  1401. else
  1402. pchoice = addoffsetinst(compst, IChoice);
  1403. l2 = gethere(compst);
  1404. codegen(compst, tree, 0, NOINST, fullset);
  1405. commit = addoffsetinst(compst, IPartialCommit);
  1406. jumptothere(compst, commit, l2);
  1407. jumptohere(compst, pchoice);
  1408. jumptohere(compst, test);
  1409. }
  1410. }
  1411. }
  1412. /*
  1413. ** Not predicate; optimizations:
  1414. ** In any case, if first test fails, 'not' succeeds, so it can jump to
  1415. ** the end. If pattern is headfail, that is all (it cannot fail
  1416. ** in other parts); this case includes 'not' of simple sets. Otherwise,
  1417. ** use the default code (a choice plus a failtwice).
  1418. */
  1419. static void codenot (CompileState *compst, TTree *tree) {
  1420. Charset st;
  1421. int e = getfirst(tree, fullset, &st);
  1422. int test = codetestset(compst, &st, e);
  1423. if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */
  1424. addinstruction(compst, IFail, 0);
  1425. else {
  1426. /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */
  1427. int pchoice = addoffsetinst(compst, IChoice);
  1428. codegen(compst, tree, 0, NOINST, fullset);
  1429. addinstruction(compst, IFailTwice, 0);
  1430. jumptohere(compst, pchoice);
  1431. }
  1432. jumptohere(compst, test);
  1433. }
  1434. /*
  1435. ** change open calls to calls, using list 'positions' to find
  1436. ** correct offsets; also optimize tail calls
  1437. */
  1438. static void correctcalls (CompileState *compst, int *positions,
  1439. int from, int to) {
  1440. int i;
  1441. Instruction *code = compst->p->code;
  1442. for (i = from; i < to; i += sizei(&code[i])) {
  1443. if (code[i].i.code == IOpenCall) {
  1444. int n = code[i].i.key; /* rule number */
  1445. int rule = positions[n]; /* rule position */
  1446. assert(rule == from || code[rule - 1].i.code == IRet);
  1447. if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */
  1448. code[i].i.code = IJmp; /* tail call */
  1449. else
  1450. code[i].i.code = ICall;
  1451. jumptothere(compst, i, rule); /* call jumps to respective rule */
  1452. }
  1453. }
  1454. assert(i == to);
  1455. }
  1456. /*
  1457. ** Code for a grammar:
  1458. ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
  1459. */
  1460. static void codegrammar (CompileState *compst, TTree *grammar) {
  1461. int positions[MAXRULES];
  1462. int rulenumber = 0;
  1463. TTree *rule;
  1464. int firstcall = addoffsetinst(compst, ICall); /* call initial rule */
  1465. int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */
  1466. int start = gethere(compst); /* here starts the initial rule */
  1467. jumptohere(compst, firstcall);
  1468. for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
  1469. positions[rulenumber++] = gethere(compst); /* save rule position */
  1470. codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */
  1471. addinstruction(compst, IRet, 0);
  1472. }
  1473. assert(rule->tag == TTrue);
  1474. jumptohere(compst, jumptoend);
  1475. correctcalls(compst, positions, start, gethere(compst));
  1476. }
  1477. static void codecall (CompileState

Large files are truncated, but you can click here to view the full file