/lpeg.c
C | 3706 lines | 2569 code | 513 blank | 624 comment | 489 complexity | 8375c196726095a13fc4ca029271b715 MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, Apache-2.0, LGPL-2.0, LGPL-2.1, MIT
Large files are truncated, but you can click here to view the full file
- /*
- ** $Id: lptypes.h,v 1.8 2013/04/12 16:26:38 roberto Exp $
- ** LPeg - PEG pattern matching for Lua
- ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
- ** written by Roberto Ierusalimschy
- */
- #if !defined(lptypes_h)
- #define lptypes_h
- #if !defined(LPEG_DEBUG)
- #define NDEBUG  /* defined before <assert.h> is included, so assert() expands to nothing */
- #endif
- #include <assert.h>
- #include <limits.h>
- #include "lua.h"
- #define VERSION "0.12"
- #define PATTERN_T "lpeg-pattern"
- #define MAXSTACKIDX "lpeg-maxstack"
- /*
- ** compatibility with Lua 5.2: the names below are redefined in terms
- ** of Lua 5.2 API functions
- */
- #if (LUA_VERSION_NUM == 502)
- #undef lua_equal
- #define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ)
- #undef lua_getfenv
- #define lua_getfenv lua_getuservalue
- #undef lua_setfenv
- #define lua_setfenv lua_setuservalue
- #undef lua_objlen
- #define lua_objlen lua_rawlen
- #undef luaL_register  /* NOTE: the replacement below is a bare { } block, not 'do { } while (0)' */
- #define luaL_register(L,n,f) \
- { if ((n) == NULL) luaL_setfuncs(L,f,0); else luaL_newlib(L,f); }
- #endif
- /*
- ** default maximum size for call/backtrack stack
- ** (used only if MAXBACK was not already defined)
- */
- #if !defined(MAXBACK)
- #define MAXBACK 100
- #endif
- /* maximum number of rules in a grammar */
- #define MAXRULES 200
- /* initial size for capture's list */
- #define INITCAPSIZE 32
- /* index, on Lua stack, for subject */
- #define SUBJIDX 2
- /* number of fixed arguments to 'match' (before capture arguments) */
- #define FIXEDARGS 3
- /* index, on Lua stack, for capture list ('ptop' is the index of the last argument to 'match') */
- #define caplistidx(ptop) ((ptop) + 2)
- /* index, on Lua stack, for pattern's ktable */
- #define ktableidx(ptop) ((ptop) + 3)
- /* index, on Lua stack, for backtracking stack */
- #define stackidx(ptop) ((ptop) + 4)
- typedef unsigned char byte;
- #define BITSPERCHAR 8
- #define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1)  /* bytes needed to hold one bit per 'unsigned char' value */
- typedef struct Charset {
- byte cs[CHARSETSIZE];  /* bitmap: bit (c & 7) of byte (c >> 3) stands for char 'c' (see 'setchar') */
- } Charset;
- #define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} }  /* runs 'b' with 'v' going over every byte index of a charset */
- /* access to charset: the bytes that start right after tree node 't' */
- #define treebuffer(t) ((byte *)((t) + 1))
- /* number of slots (each the size of a TTree) needed for 'n' bytes */
- #define bytes2slots(n) (((n) - 1) / sizeof(TTree) + 1)
- /* set 'b' bit in charset 'cs' */
- #define setchar(cs,b) ((cs)[(b) >> 3] |= (1 << ((b) & 7)))
- /*
- ** in capture instructions, 'kind' of capture and its offset are
- ** packed in field 'aux', 4 bits for each: the kind goes in the low
- ** 4 bits and the offset in the high 4 bits
- */
- #define getkind(op) ((op)->i.aux & 0xF)
- #define getoff(op) (((op)->i.aux >> 4) & 0xF)
- #define joinkindoff(k,o) ((k) | ((o) << 4))
- #define MAXOFF 0xF
- #define MAXAUX 0xFF
- /* maximum number of bytes to look behind */
- #define MAXBEHIND MAXAUX
- /* maximum size (in elements) for a pattern */
- #define MAXPATTSIZE (SHRT_MAX - 10)
- /* size (in elements) for an instruction plus extra l bytes */
- #define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1)
- /* size (in elements) for a ISet instruction */
- #define CHARSETINSTSIZE instsize(CHARSETSIZE)
- /* size (in elements) for a IFunc instruction */
- #define funcinstsize(p) ((p)->i.aux + 2)
- #define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7))))  /* non-zero iff bit 'c' is set in charset 'st' */
- #endif
- /*
- ** $Id: lptree.h,v 1.2 2013/03/24 13:51:12 roberto Exp $
- */
- #if !defined(lptree_h)
- #define lptree_h
- /*
- ** types of trees (stored in field 'tag' of a TTree)
- */
- typedef enum TTag {
- TChar = 0, TSet, TAny, /* standard PEG elements */
- TTrue, TFalse,
- TRep,
- TSeq, TChoice,
- TNot, TAnd,
- TCall,
- TOpenCall,
- TRule, /* sib1 is rule's pattern, sib2 is 'next' rule */
- TGrammar, /* sib1 is initial (and first) rule */
- TBehind, /* match behind */
- TCapture, /* regular capture */
- TRunTime /* run-time capture */
- } TTag;
- /* number of siblings for each tree (indexed by the tree's tag) */
- extern const byte numsiblings[];
- /*
- ** Tree nodes
- ** The first sibling of a tree (if there is one) is immediately after
- ** the tree. A reference to a second sibling (ps) is its position
- ** relative to the position of the tree itself. A key in ktable
- ** uses the (unique) address of the original tree that created that
- ** entry. NULL means no data.
- */
- typedef struct TTree {
- byte tag;  /* a TTag value */
- byte cap; /* kind of capture (if it is a capture) */
- unsigned short key; /* key in ktable for Lua data (0 if no key) */
- union {
- int ps; /* occasional second sibling (offset used by 'sib2') */
- int n; /* occasional counter */
- } u;
- } TTree;
- /*
- ** A complete pattern has its tree plus, if already compiled,
- ** its corresponding code
- */
- typedef struct Pattern {
- union Instruction *code;  /* compiled code, if any (see comment above) */
- int codesize;
- TTree tree[1];  /* first node of the tree; presumably the other nodes follow it in memory -- confirm at the allocation site */
- } Pattern;
- /* number of siblings for each tree (repeats the declaration made earlier in this header) */
- extern const byte numsiblings[];
- /* access to siblings: sib1 is the node right after 't'; sib2 is 'u.ps' nodes away from 't' */
- #define sib1(t) ((t) + 1)
- #define sib2(t) ((t) + (t)->u.ps)
- #endif
- /*
- ** $Id: lpcap.h,v 1.1 2013/03/21 20:25:12 roberto Exp $
- */
- #if !defined(lpcap_h)
- #define lpcap_h
- /* kinds of captures (Cclose marks the entry that ends an open capture) */
- typedef enum CapKind {
- Cclose, Cposition, Cconst, Cbackref, Carg, Csimple, Ctable, Cfunction,
- Cquery, Cstring, Cnum, Csubst, Cfold, Cruntime, Cgroup
- } CapKind;
- typedef struct Capture {
- const char *s; /* subject position */
- short idx; /* extra info about capture (group name, arg index, etc.) */
- byte kind; /* kind of capture (a CapKind value) */
- byte siz; /* size of full capture + 1 (0 = not a full capture) */
- } Capture;
- typedef struct CapState {
- Capture *cap; /* current capture */
- Capture *ocap; /* (original) capture list */
- lua_State *L;
- int ptop; /* index of last argument to 'match' */
- const char *s; /* original string */
- int valuecached; /* value stored in cache slot (ktable index; see 'updatecache') */
- } CapState;
- int runtimecap (CapState *cs, Capture *close, const char *s, int *rem);  /* calls a run-time capture */
- int getcaptures (lua_State *L, const char *s, const char *r, int ptop);  /* pushes the values of all captures */
- int finddyncap (Capture *cap, Capture *last);  /* 'idx' of first Cruntime entry in [cap, last), or 0 */
- #endif
- /*
- ** $Id: lpvm.h,v 1.2 2013/04/03 20:37:18 roberto Exp $
- */
- #if !defined(lpvm_h)
- #define lpvm_h
- /* Virtual Machine's instructions */
- typedef enum Opcode {
- IAny, /* if no char, fail */
- IChar, /* if char != aux, fail */
- ISet, /* if char not in buff, fail */
- ITestAny, /* if no char, jump to 'offset' */
- ITestChar, /* if char != aux, jump to 'offset' */
- ITestSet, /* if char not in buff, jump to 'offset' */
- ISpan, /* read a span of chars in buff */
- IBehind, /* walk back 'aux' characters (fail if not possible) */
- IRet, /* return from a rule */
- IEnd, /* end of pattern */
- IChoice, /* stack a choice; next fail will jump to 'offset' */
- IJmp, /* jump to 'offset' */
- ICall, /* call rule at 'offset' */
- IOpenCall, /* call rule number 'key' (must be closed to a ICall) */
- ICommit, /* pop choice and jump to 'offset' */
- IPartialCommit, /* update top choice to current position and jump */
- IBackCommit, /* "fails" but jump to its own 'offset' */
- IFailTwice, /* pop one choice and then fail */
- IFail, /* go back to saved state on choice and jump to saved offset */
- IGiveup, /* internal use */
- IFullCapture, /* complete capture of last 'off' chars */
- IOpenCapture, /* start a capture */
- ICloseCapture,
- ICloseRunTime
- } Opcode;
- typedef union Instruction {
- struct Inst {
- byte code;  /* presumably an Opcode value -- confirm in the VM */
- byte aux;  /* the 'aux' operand named in the opcode comments above */
- short key;  /* the 'key' operand named in the opcode comments above */
- } i;
- int offset;  /* the 'offset' operand named in the opcode comments above */
- byte buff[1];  /* the 'buff' (charset bytes) named in the opcode comments above */
- } Instruction;
- int getposition (lua_State *L, int t, int i);
- void printpatt (Instruction *p, int n);
- const char *match (lua_State *L, const char *o, const char *s, const char *e,
- Instruction *op, Capture *capture, int ptop);
- int verify (lua_State *L, Instruction *op, const Instruction *p,
- Instruction *e, int postable, int rule);
- void checkrule (lua_State *L, Instruction *op, int from, int to,
- int postable, int rule);
- #endif
- /*
- ** $Id: lpcode.h,v 1.5 2013/04/04 21:24:45 roberto Exp $
- */
- #if !defined(lpcode_h)
- #define lpcode_h
- #include "lua.h"
- int tocharset (TTree *tree, Charset *cs);  /* 1 if 'tree' is TSet/TChar/TAny (and fills 'cs'), else 0 */
- int checkaux (TTree *tree, int pred);  /* 'pred' is PEnullable or PEnofail */
- int fixedlenx (TTree *tree, int count, int len);  /* fixed match length, or -1 if variable */
- int hascaptures (TTree *tree);
- int lp_gc (lua_State *L);
- Instruction *compile (lua_State *L, Pattern *p);
- void reallocprog (lua_State *L, Pattern *p, int nsize);
- int sizei (const Instruction *i);
- #define PEnullable 0  /* 'pred' value: can the pattern match without consuming input? */
- #define PEnofail 1  /* 'pred' value: does the pattern never fail? */
- #define nofail(t) checkaux(t, PEnofail)
- #define nullable(t) checkaux(t, PEnullable)
- #define fixedlen(t) fixedlenx(t, 0, 0)  /* starts with no calls followed and length 0 */
- #endif
- /*
- ** $Id: lpprint.h,v 1.1 2013/03/21 20:25:12 roberto Exp $
- */
- #if !defined(lpprint_h)
- #define lpprint_h
- #if defined(LPEG_DEBUG)
- void printpatt (Instruction *p, int n);
- void printtree (TTree *tree, int ident);
- void printktable (lua_State *L, int idx);
- void printcharset (const byte *st);
- void printcaplist (Capture *cap, Capture *limit);
- #else  /* without LPEG_DEBUG, three of the printers become calls to luaL_error */
- #define printktable(L,idx) \
- luaL_error(L, "function only implemented in debug mode")
- #define printtree(tree,i) \
- luaL_error(L, "function only implemented in debug mode")  /* 'L' is not a macro parameter: it comes from the call site */
- #define printpatt(p,n) \
- luaL_error(L, "function only implemented in debug mode")  /* 'L' is not a macro parameter: it comes from the call site */
- #endif
- #endif
- /*
- ** $Id: lpcap.c,v 1.4 2013/03/21 20:25:12 roberto Exp $
- ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
- */
- #include "lua.h"
- #include "lauxlib.h"
- #define captype(cap) ((cap)->kind)
- #define isclosecap(cap) (captype(cap) == Cclose)
- #define closeaddr(c) ((c)->s + (c)->siz - 1)  /* subject position where full capture 'c' ends ('siz' is its size + 1) */
- #define isfullcap(cap) ((cap)->siz != 0)  /* 'siz' is 0 for entries that are not full captures */
- #define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v)  /* pushes ktable[v] */
- #define pushluaval(cs) getfromktable(cs, (cs)->cap->idx)  /* pushes the ktable value indexed by the current capture's 'idx' */
- /*
- ** Make sure the cache slot for Lua values holds entry 'v' of the
- ** running pattern's ktable, fetching it only when a different entry
- ** is currently cached. Returns the stack index of the cache slot.
- */
- static int updatecache (CapState *cs, int v) {
-   const int slot = cs->ptop + 1;  /* reserved stack position used as cache */
-   if (cs->valuecached == v)
-     return slot;  /* wanted value is already there */
-   getfromktable(cs, v);  /* push ktable[v] */
-   lua_replace(cs->L, slot);  /* move it into the cache slot */
-   cs->valuecached = v;  /* remember which entry is cached */
-   return slot;
- }
- static int pushcapture (CapState *cs);  /* forward declaration: used by the capture helpers defined before it */
- /*
- ** Walk backwards from a close entry to the open capture that matches
- ** it, stepping over complete open/close pairs found on the way.
- ** Full (single-entry) captures are ignored.
- */
- static Capture *findopen (Capture *cap) {
-   int pending = 0;  /* closes seen that still need their own open */
-   for (cap--; ; cap--) {
-     if (isclosecap(cap))
-       pending++;  /* nested pair: its open must be skipped later */
-     else if (!isfullcap(cap)) {  /* an open entry */
-       if (pending == 0)
-         return cap;  /* this is the matching open */
-       pending--;  /* it only opens a nested pair */
-     }
-   }
- }
- /*
- ** Advance 'cs->cap' past the current capture: one entry for a full
- ** capture, or everything up to and including the matching close for
- ** an open capture.
- */
- static void nextcap (CapState *cs) {
-   Capture *p = cs->cap;
-   if (!isfullcap(p)) {  /* open capture: look for its close */
-     int depth = 0;  /* nested opens still waiting for a close */
-     for (p++; ; p++) {
-       if (isclosecap(p)) {
-         if (depth == 0)
-           break;  /* found the close of the current capture */
-         depth--;
-       }
-       else if (!isfullcap(p))
-         depth++;  /* another open nested inside */
-     }
-   }
-   cs->cap = p + 1;  /* step over the close (or the full capture itself) */
- }
- /*
- ** Push on the Lua stack every value produced by the captures nested
- ** in the current capture and return how many were pushed. With
- ** 'addextra' set, the text of the whole match is pushed after them;
- ** it is also pushed when the nested captures produce nothing, so the
- ** result is never zero.
- */
- static int pushnestedvalues (CapState *cs, int addextra) {
-   lua_State *L = cs->L;
-   Capture *open = cs->cap;
-   int count = 0;
-   cs->cap = open + 1;  /* step past the opening entry */
-   if (isfullcap(open)) {  /* full capture: nothing nested */
-     lua_pushlstring(L, open->s, open->siz - 1);  /* push whole match */
-     return 1;
-   }
-   while (!isclosecap(cs->cap))  /* evaluate each nested capture in turn */
-     count += pushcapture(cs);
-   if (addextra || count == 0) {  /* whole match wanted (or needed)? */
-     lua_pushlstring(L, open->s, cs->cap->s - open->s);
-     count++;
-   }
-   cs->cap++;  /* step past the close entry */
-   return count;
- }
- /*
- ** Evaluate the nested captures but keep only the first value they
- ** produce on the stack
- */
- static void pushonenestedvalue (CapState *cs) {
-   int extra = pushnestedvalues(cs, 0) - 1;  /* values beyond the first */
-   if (extra > 0)
-     lua_pop(cs->L, extra);
- }
- /*
- ** Search backwards from 'cap' for a group capture whose name equals
- ** the value at the top of the stack. On success both names are popped
- ** and the group is returned; otherwise a Lua error is raised.
- */
- static Capture *findback (CapState *cs, Capture *cap) {
-   lua_State *L = cs->L;
-   while (cap > cs->ocap) {  /* until the start of the list */
-     cap--;
-     if (isclosecap(cap))
-       cap = findopen(cap);  /* jump to the open of this nested capture */
-     else if (!isfullcap(cap))
-       continue;  /* open of an enclosing capture: ignore it */
-     if (captype(cap) != Cgroup)
-       continue;
-     getfromktable(cs, cap->idx);  /* push this group's name */
-     if (lua_equal(L, -2, -1)) {  /* same as the wanted name? */
-       lua_pop(L, 2);  /* drop both names */
-       return cap;
-     }
-     lua_pop(L, 1);  /* drop this group's name and keep looking */
-   }
-   luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1));
-   return NULL;  /* to avoid warnings */
- }
- /*
- ** Back-reference capture: pushes the values of the group it names
- ** and returns how many values were pushed.
- */
- static int backrefcap (CapState *cs) {
-   Capture *self = cs->cap;  /* the back-reference entry itself */
-   int pushed;
-   pushluaval(cs);  /* name of the wanted group */
-   cs->cap = findback(cs, self);  /* move to that group */
-   pushed = pushnestedvalues(cs, 0);  /* evaluate it */
-   cs->cap = self + 1;  /* resume right after the back reference */
-   return pushed;
- }
- /*
- ** Table capture: pushes a new table filled with the values of the
- ** nested captures. Named groups are stored under their names (first
- ** value only); all other values go to consecutive integer keys.
- */
- static int tablecap (CapState *cs) {
-   lua_State *L = cs->L;
-   Capture *open;
-   int filled = 0;  /* number of integer keys already used */
-   lua_newtable(L);
-   open = cs->cap++;  /* step past the opening entry */
-   if (isfullcap(open))
-     return 1;  /* nothing nested: table stays empty */
-   while (!isclosecap(cs->cap)) {
-     int named = (captype(cs->cap) == Cgroup && cs->cap->idx != 0);
-     if (named) {
-       pushluaval(cs);  /* key: the group's name */
-       pushonenestedvalue(cs);  /* value: its first result */
-       lua_settable(L, -3);
-     }
-     else {
-       int k = pushcapture(cs);
-       int last = filled + k;  /* highest key after these values */
-       for (; k > 0; k--)  /* top of stack goes to the highest key */
-         lua_rawseti(L, -(k + 1), filled + k);
-       filled = last;
-     }
-   }
-   cs->cap++;  /* step past the close entry */
-   return 1;  /* only the table is left on the stack */
- }
- /*
- ** Table-query capture: indexes the capture's table with the first
- ** nested value. Returns 1 with the result on the stack, or 0 (and
- ** nothing pushed) when the result is nil.
- */
- static int querycap (CapState *cs) {
-   lua_State *L = cs->L;
-   int key = cs->cap->idx;  /* ktable index of the table to query */
-   int tbl;
-   pushonenestedvalue(cs);  /* value used as the lookup key */
-   tbl = updatecache(cs, key);  /* stack index of the table */
-   lua_gettable(L, tbl);
-   if (lua_isnil(L, -1)) {  /* no entry? */
-     lua_pop(L, 1);  /* discard the nil */
-     return 0;
-   }
-   return 1;
- }
- /*
- ** Fold capture: the first value of the first nested capture becomes
- ** the accumulator; for each following nested capture the folding
- ** function is called with the accumulator and that capture's values,
- ** and its single result becomes the new accumulator.
- */
- static int foldcap (CapState *cs) {
-   lua_State *L = cs->L;
-   int fn = cs->cap->idx;  /* ktable index of the folding function */
-   Capture *open = cs->cap++;  /* step past the opening entry */
-   int n;
-   if (isfullcap(open) || isclosecap(cs->cap))  /* nothing nested? */
-     return luaL_error(L, "no initial value for fold capture");
-   n = pushcapture(cs);
-   if (n == 0)  /* first nested capture produced no value? */
-     return luaL_error(L, "no initial value for fold capture");
-   if (n > 1)
-     lua_pop(L, n - 1);  /* accumulator is the first value only */
-   for (; !isclosecap(cs->cap); ) {
-     lua_pushvalue(L, updatecache(cs, fn));  /* folding function */
-     lua_insert(L, -2);  /* place it below the accumulator */
-     n = pushcapture(cs);  /* arguments from the next capture */
-     lua_call(L, n + 1, 1);  /* result is the new accumulator */
-   }
-   cs->cap++;  /* step past the close entry */
-   return 1;  /* only the accumulator is left */
- }
- /*
- ** Function capture: calls the capture's function with the nested
- ** values as arguments and returns how many results it left on the
- ** stack.
- */
- static int functioncap (CapState *cs) {
-   lua_State *L = cs->L;
-   int base = lua_gettop(L);  /* stack height before the call */
-   int nargs;
-   pushluaval(cs);  /* the function */
-   nargs = pushnestedvalues(cs, 0);  /* its arguments */
-   lua_call(L, nargs, LUA_MULTRET);
-   return lua_gettop(L) - base;  /* everything above 'base' is a result */
- }
- /*
- ** Select capture: keeps only the nested value whose position is given
- ** by the capture's 'idx'. An 'idx' of zero produces no value; an 'idx'
- ** larger than the number of values raises an error.
- */
- static int numcap (CapState *cs) {
-   lua_State *L = cs->L;
-   int wanted = cs->cap->idx;  /* position of the value to keep */
-   int n;
-   if (wanted == 0) {
-     nextcap(cs);  /* skip the capture without evaluating it */
-     return 0;
-   }
-   n = pushnestedvalues(cs, 0);
-   if (n < wanted)  /* fewer values than requested? */
-     return luaL_error(L, "no capture '%d'", wanted);
-   lua_pushvalue(L, -(n - wanted + 1));  /* copy the selected value */
-   lua_replace(L, -(n + 1));  /* store it over the first value */
-   lua_pop(L, n - 1);  /* drop all the others */
-   return 1;
- }
- /*
- ** Scan the captures in ['cap', 'last') and return the 'idx' field
- ** (a stack index) of the first run-time capture found, or zero when
- ** there is none
- */
- int finddyncap (Capture *cap, Capture *last) {
-   while (cap < last) {
-     if (cap->kind == Cruntime)
-       return cap->idx;  /* stack position of first capture */
-     cap++;
-   }
-   return 0;  /* no dynamic captures in this segment */
- }
- /*
- ** Calls a runtime capture: the group's function receives the subject,
- ** the current position and the nested capture values. Returns the
- ** number of capture entries removed by the call, including the
- ** initial Cgroup; '*rem' receives the number of old dynamic-capture
- ** values removed from the Lua stack. (The values returned by the
- ** function are left on the Lua stack.)
- */
- int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) {
-   lua_State *L = cs->L;
-   int base = lua_gettop(L);  /* stack height before the call */
-   Capture *open = findopen(close);
-   int first;  /* stack index of first old dynamic capture (0 if none) */
-   int nargs;
-   assert(captype(open) == Cgroup);
-   first = finddyncap(open, close);
-   close->kind = Cclose;  /* turn 'close' into a regular close entry */
-   close->s = s;
-   cs->cap = open;  /* prepare capture state */
-   cs->valuecached = 0;
-   luaL_checkstack(L, 4, "too many runtime captures");
-   pushluaval(cs);  /* function to be called */
-   lua_pushvalue(L, SUBJIDX);  /* original subject */
-   lua_pushinteger(L, s - cs->s + 1);  /* current position */
-   nargs = pushnestedvalues(cs, 0);  /* nested captures */
-   lua_call(L, nargs + 2, LUA_MULTRET);
-   if (first > 0) {  /* old dynamic captures to discard? */
-     int left;
-     for (left = base - first + 1; left > 0; left--)
-       lua_remove(L, first);  /* each removal shifts the next one down */
-     *rem = base - first + 1;
-   }
-   else
-     *rem = 0;
-   return close - open;  /* number of captures of all kinds removed */
- }
- /*
- ** Auxiliary structure for substitution and string captures: keep
- ** information about nested captures for future use, avoiding to push
- ** string results into Lua. Each entry is either a piece of the
- ** subject (when 'isstring' is set) or a pointer to a capture that is
- ** evaluated only if it is needed.
- */
- typedef struct StrAux {
- int isstring; /* whether capture is a string */
- union {
- Capture *cp; /* if not a string, respective capture */
- struct { /* if it is a string... */
- const char *s; /* ... starts here */
- const char *e; /* ... ends here */
- } s;
- } u;
- } StrAux;
- #define MAXSTRCAPS 10  /* size of the StrAux array used by 'stringcap' (indices 0-9, one decimal digit) */
- /*
- ** Collect values from current capture into array 'cps'. Current
- ** capture must be Cstring (first call) or Csimple (recursive calls).
- ** (In first call, fills %0 with whole match for Cstring.)
- ** 'n' is the first free slot: that slot receives the text of the
- ** current capture itself and nested captures fill the slots after
- ** it. Once MAXSTRCAPS slots are in use, further nested captures are
- ** skipped.
- ** Returns number of elements in the array that were filled.
- */
- static int getstrcaps (CapState *cs, StrAux *cps, int n) {
- int k = n++;  /* 'k' is this capture's own slot; 'n' is now the next free one */
- cps[k].isstring = 1; /* get string value */
- cps[k].u.s.s = cs->cap->s; /* starts here */
- if (!isfullcap(cs->cap++)) { /* nested captures? */
- while (!isclosecap(cs->cap)) { /* traverse them */
- if (n >= MAXSTRCAPS) /* too many captures? */
- nextcap(cs); /* skip extra captures (will not need them) */
- else if (captype(cs->cap) == Csimple) /* string? */
- n = getstrcaps(cs, cps, n); /* put info. into array (recursion also collects its nested captures) */
- else {
- cps[n].isstring = 0; /* not a string */
- cps[n].u.cp = cs->cap; /* keep original capture (evaluated later only if needed) */
- nextcap(cs);
- n++;
- }
- }
- cs->cap++; /* skip close */
- }
- cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here ('cs->cap - 1' is the close entry, or the full capture itself) */
- return n;
- }
- /*
- ** add next capture value (which should be a string) to buffer 'b'
- ** (forward declaration: used by 'stringcap' and 'substcap' before
- ** its definition)
- */
- static int addonestring (luaL_Buffer *b, CapState *cs, const char *what);
- /*
- ** String capture: add result to buffer 'b' (instead of pushing
- ** it into the stack). The format string comes from the ktable entry
- ** given by the capture's 'idx'. In the format, '%' followed by a
- ** digit d is replaced by the d-th collected value (%0 is the whole
- ** match); '%' followed by any other character adds that character.
- ** A digit larger than the number of collected values raises an error.
- */
- static void stringcap (luaL_Buffer *b, CapState *cs) {
- StrAux cps[MAXSTRCAPS];
- int n;  /* highest valid index in 'cps' */
- size_t len, i;
- const char *fmt; /* format string */
- fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len);
- n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures */
- for (i = 0; i < len; i++) { /* traverse the format string */
- if (fmt[i] != '%') /* not an escape? */
- luaL_addchar(b, fmt[i]); /* add it to buffer */
- else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? */
- luaL_addchar(b, fmt[i]); /* add to buffer; NOTE(review): if '%' is the last byte, this is fmt[len] -- presumably the terminating NUL of the Lua string; confirm that appending it is intended */
- else {
- int l = fmt[i] - '0'; /* capture index */
- if (l > n)
- luaL_error(cs->L, "invalid capture index (%d)", l);
- else if (cps[l].isstring)
- luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s);
- else {
- Capture *curr = cs->cap;
- cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */
- if (!addonestring(b, cs, "capture"))
- luaL_error(cs->L, "no values in capture index %d", l);
- cs->cap = curr; /* continue from where it stopped */
- }
- }
- }
- }
- /*
- ** Substitution capture: adds to buffer 'b' the matched text with each
- ** nested capture replaced by its first value. A nested capture that
- ** produces no value leaves its original text in place.
- */
- static void substcap (luaL_Buffer *b, CapState *cs) {
-   const char *pos = cs->cap->s;  /* start of text not yet copied */
-   if (!isfullcap(cs->cap)) {
-     for (cs->cap++; !isclosecap(cs->cap); ) {  /* each nested capture */
-       const char *start = cs->cap->s;
-       luaL_addlstring(b, pos, start - pos);  /* text before the capture */
-       pos = addonestring(b, cs, "replacement")
-               ? closeaddr(cs->cap - 1)  /* replaced: go on after its match */
-               : start;  /* no value: its text is copied later */
-     }
-     luaL_addlstring(b, pos, cs->cap->s - pos);  /* text after last capture */
-   }
-   else  /* no nested captures */
-     luaL_addlstring(b, pos, cs->cap->siz - 1);  /* keep original text */
-   cs->cap++;  /* go to next capture */
- }
- /*
- ** Evaluate the current capture and append its first value to buffer
- ** 'b'. Returns non-zero when a value was added. 'what' names the role
- ** of the value in the error raised when it is not a string.
- */
- static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) {
-   lua_State *L;
-   int n;
-   if (captype(cs->cap) == Cstring) {
-     stringcap(b, cs);  /* writes straight into the buffer */
-     return 1;
-   }
-   if (captype(cs->cap) == Csubst) {
-     substcap(b, cs);  /* writes straight into the buffer */
-     return 1;
-   }
-   L = cs->L;
-   n = pushcapture(cs);  /* any other kind goes through the stack */
-   if (n > 0) {
-     if (n > 1)
-       lua_pop(L, n - 1);  /* keep the first value only */
-     if (!lua_isstring(L, -1))
-       luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1));
-     luaL_addvalue(b);
-   }
-   return n;
- }
- /*
- ** Push all values of the current capture into the stack; returns
- ** number of values pushed. Dispatches on the kind of the current
- ** capture; every branch leaves 'cs->cap' after the capture it
- ** handled.
- */
- static int pushcapture (CapState *cs) {
- lua_State *L = cs->L;
- luaL_checkstack(L, 4, "too many captures");
- switch (captype(cs->cap)) {
- case Cposition: {
- lua_pushinteger(L, cs->cap->s - cs->s + 1);  /* 1-based offset into the subject */
- cs->cap++;
- return 1;
- }
- case Cconst: {
- pushluaval(cs);  /* the value comes from the ktable */
- cs->cap++;
- return 1;
- }
- case Carg: {
- int arg = (cs->cap++)->idx;
- if (arg + FIXEDARGS > cs->ptop)  /* beyond the last argument to 'match'? */
- return luaL_error(L, "reference to absent argument #%d", arg);
- lua_pushvalue(L, arg + FIXEDARGS);
- return 1;
- }
- case Csimple: {
- int k = pushnestedvalues(cs, 1);  /* whole match is pushed last */
- lua_insert(L, -k); /* make whole match be first result */
- return k;
- }
- case Cruntime: {
- lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */
- return 1;
- }
- case Cstring: {
- luaL_Buffer b;
- luaL_buffinit(L, &b);
- stringcap(&b, cs);
- luaL_pushresult(&b);
- return 1;
- }
- case Csubst: {
- luaL_Buffer b;
- luaL_buffinit(L, &b);
- substcap(&b, cs);
- luaL_pushresult(&b);
- return 1;
- }
- case Cgroup: {
- if (cs->cap->idx == 0) /* anonymous group? */
- return pushnestedvalues(cs, 0); /* add all nested values */
- else { /* named group: add no values */
- nextcap(cs); /* skip capture */
- return 0;
- }
- }
- case Cbackref: return backrefcap(cs);
- case Ctable: return tablecap(cs);
- case Cfunction: return functioncap(cs);
- case Cnum: return numcap(cs);
- case Cquery: return querycap(cs);
- case Cfold: return foldcap(cs);
- default: assert(0); return 0;  /* any other kind (including Cclose) is not expected here */
- }
- }
- /*
- ** Evaluate the whole capture list stored on the Lua stack and push
- ** the resulting values. 's' is the subject string, 'r' is the final
- ** position of the match, and 'ptop' the stack index from which the
- ** list and the other auxiliary values are located. Returns the number
- ** of values pushed; when the captures produce no value, the position
- ** after the match is pushed instead.
- */
- int getcaptures (lua_State *L, const char *s, const char *r, int ptop) {
-   Capture *list = (Capture *)lua_touserdata(L, caplistidx(ptop));
-   int total = 0;
-   if (!isclosecap(list)) {  /* list is not empty? */
-     CapState cs;
-     cs.L = L;
-     cs.s = s;
-     cs.ptop = ptop;
-     cs.valuecached = 0;
-     cs.ocap = list;
-     cs.cap = list;
-     for (;;) {  /* evaluate captures until the final close entry */
-       total += pushcapture(&cs);
-       if (isclosecap(cs.cap))
-         break;
-     }
-   }
-   if (total != 0)
-     return total;
-   lua_pushinteger(L, r - s + 1);  /* no values: return only end position */
-   return 1;
- }
- /*
- ** $Id: lpcode.c,v 1.18 2013/04/12 16:30:33 roberto Exp $
- ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
- */
- #include <limits.h>
- #include "lua.h"
- #include "lauxlib.h"
- /* signals a "no-instruction" */
- #define NOINST -1
- static const Charset fullset_ =
- {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}};  /* every bit set: 32 bytes of 0xFF */
- static const Charset *fullset = &fullset_;  /* pointer to the all-ones set above */
- /*
- ** {======================================================
- ** Analysis and some optimizations
- ** =======================================================
- */
- /*
- ** Check whether a charset is empty (IFail), singleton (IChar),
- ** full (IAny), or none of those (ISet). For a singleton, the
- ** character is stored in '*c'.
- ** 'count' is the number of bits seen so far while the set can still
- ** be empty (0), a singleton (1), or full (all bytes so far are 0xFF).
- */
- static Opcode charsettype (const byte *cs, int *c) {
- int count = 0;
- int i;
- int candidate = -1; /* candidate position for a char */
- for (i = 0; i < CHARSETSIZE; i++) {
- int b = cs[i];
- if (b == 0) {
- if (count > 1) return ISet; /* else set is still empty (or still a singleton) */
- }
- else if (b == 0xFF) {
- if (count < (i * BITSPERCHAR))  /* some previous byte was not full? */
- return ISet;
- else count += BITSPERCHAR; /* set is still full */
- }
- else if ((b & (b - 1)) == 0) { /* byte has only one bit? */
- if (count > 0)
- return ISet; /* set is neither full nor empty */
- else { /* set has only one char till now; track it */
- count++;
- candidate = i;
- }
- }
- else return ISet; /* byte is neither empty, full, nor singleton */
- }
- switch (count) {
- case 0: return IFail; /* empty set */
- case 1: { /* singleton; find character bit inside byte */
- int b = cs[candidate];
- *c = candidate * BITSPERCHAR;
- if ((b & 0xF0) != 0) { *c += 4; b >>= 4; }  /* bit is in the high half */
- if ((b & 0x0C) != 0) { *c += 2; b >>= 2; }
- if ((b & 0x02) != 0) { *c += 1; }
- return IChar;
- }
- default: {
- assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */
- return IAny;
- }
- }
- }
- /*
- ** Basic operations on Charsets.
- ** cs_complement: invert, in place, every bit of the set
- */
- static void cs_complement (Charset *cs) {
-   loopset(k, cs->cs[k] = ~(cs->cs[k]));
- }
- /* returns 1 when the two sets have the same bytes, 0 otherwise */
- static int cs_equal (const byte *cs1, const byte *cs2) {
-   loopset(k, if (cs1[k] == cs2[k]) continue; else return 0);
-   return 1;
- }
- /*
- ** returns 1 when sets cs1 and cs2 have no character in common,
- ** 0 otherwise
- */
- static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
-   loopset(k, if (cs1->cs[k] & cs2->cs[k]) return 0);
-   return 1;
- }
- /*
- ** Convert a 'char' pattern (TSet, TChar, TAny) to a charset stored
- ** in 'cs'. Returns 1 on success; for any other kind of tree, returns
- ** 0 and leaves 'cs' untouched.
- */
- int tocharset (TTree *tree, Charset *cs) {
-   if (tree->tag == TSet) {  /* copy the set kept after the node */
-     loopset(k, cs->cs[k] = treebuffer(tree)[k]);
-     return 1;
-   }
-   if (tree->tag == TChar) {  /* set with a single char */
-     assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
-     loopset(k, cs->cs[k] = 0);  /* start from the empty set */
-     setchar(cs->cs, tree->u.n);  /* add that one char */
-     return 1;
-   }
-   if (tree->tag == TAny) {  /* set with all chars */
-     loopset(k, cs->cs[k] = 0xFF);
-     return 1;
-   }
-   return 0;  /* not a 'char' pattern */
- }
- /*
- ** Checks whether a pattern has captures: returns 1 if the tree
- ** contains a TCapture or TRunTime node, 0 otherwise
- */
- int hascaptures (TTree *tree) {
-   for (;;) {  /* loop stands for the tail calls */
-     if (tree->tag == TCapture || tree->tag == TRunTime)
-       return 1;
-     switch (numsiblings[tree->tag]) {
-       case 1:  /* only one child: go on with it */
-         tree = sib1(tree);
-         break;
-       case 2:  /* check first child, then go on with the second */
-         if (hascaptures(sib1(tree)))
-           return 1;
-         tree = sib2(tree);
-         break;
-       default:  /* leaf */
-         assert(numsiblings[tree->tag] == 0);
-         return 0;
-     }
-   }
- }
- /*
- ** Checks how a pattern behaves regarding the empty string,
- ** in one of two different ways, selected by 'pred':
- ** A pattern is *nullable* if it can match without consuming any character;
- ** A pattern is *nofail* if it never fails for any string
- ** (including the empty string).
- ** The difference is only for predicates and run-time captures;
- ** for other patterns, the two properties are equivalent.
- ** (With predicates, &'a' is nullable but not nofail. Of course,
- ** nofail => nullable.)
- ** The answers are conservative in the following way:
- ** p is nullable => nullable(p)
- ** nofail(p) => p cannot fail
- ** (The function assumes that TOpenCall is not nullable;
- ** this will be checked again when the grammar is fixed.)
- ** Run-time captures can do whatever they want, so the result
- ** is conservative.
- */
- int checkaux (TTree *tree, int pred) {
-   for (;;) {  /* a 'break' inside the switch repeats with the new 'tree' */
-     switch (tree->tag) {
-       case TChar: case TSet: case TAny:
-       case TFalse: case TOpenCall:
-         return 0;  /* not nullable */
-       case TRep: case TTrue:
-         return 1;  /* no fail */
-       case TNot: case TBehind:  /* can match empty, but can fail */
-         return (pred == PEnofail) ? 0 : 1;
-       case TAnd:  /* can match empty; fail iff body does */
-         if (pred == PEnullable) return 1;
-         tree = sib1(tree);
-         break;
-       case TRunTime:  /* can fail; match empty iff body does */
-         if (pred == PEnofail) return 0;
-         tree = sib1(tree);
-         break;
-       case TSeq:  /* both parts must have the property */
-         if (!checkaux(sib1(tree), pred)) return 0;
-         tree = sib2(tree);
-         break;
-       case TChoice:  /* one alternative is enough */
-         if (checkaux(sib2(tree), pred)) return 1;
-         tree = sib1(tree);
-         break;
-       case TCapture: case TGrammar: case TRule:
-         tree = sib1(tree);  /* same as the inner pattern */
-         break;
-       case TCall:
-         tree = sib2(tree);  /* same as the called rule */
-         break;
-       default: assert(0); return 0;
-     }
-   }
- }
- /*
- ** Number of characters matched by a pattern, added to 'len'
- ** (or -1 if that number is variable).
- ** 'count' is the number of calls already followed; giving up after
- ** MAXRULES of them avoids infinite loops for grammars.
- */
- int fixedlenx (TTree *tree, int count, int len) {
-   for (;;) {  /* a 'break' inside the switch repeats with the new 'tree' */
-     switch (tree->tag) {
-       case TChar: case TSet: case TAny:
-         return len + 1;  /* exactly one character */
-       case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
-         return len;  /* consumes nothing */
-       case TRep: case TRunTime: case TOpenCall:
-         return -1;  /* variable */
-       case TCapture: case TRule: case TGrammar:
-         tree = sib1(tree);  /* same as the inner pattern */
-         break;
-       case TCall:
-         if (count++ >= MAXRULES)
-           return -1;  /* may be a loop */
-         tree = sib2(tree);  /* same as the called rule */
-         break;
-       case TSeq:
-         len = fixedlenx(sib1(tree), count, len);
-         if (len < 0) return -1;
-         tree = sib2(tree);  /* second part adds to the first */
-         break;
-       case TChoice: {  /* fixed only if both alternatives agree */
-         int first = fixedlenx(sib1(tree), count, len);
-         int second;
-         if (first < 0) return -1;
-         second = fixedlenx(sib2(tree), count, len);
-         return (first == second) ? first : -1;
-       }
-       default: assert(0); return 0;
-     }
-   }
- }
- /*
- ** Computes the 'first set' of a pattern.
- ** The result is a conservative approximation:
- ** match p ax -> x' for some x ==> a in first(p).
- ** The set 'follow' is the first set of what follows the
- ** pattern (full set if nothing follows it).
- ** The function returns 0 when this set can be used for
- ** tests that avoid the pattern altogether.
- ** A non-zero return can happen for two reasons:
- ** 1) match p '' -> '' ==> returns 1.
- ** (tests cannot be used because they always fail for an empty input)
- ** 2) there is a match-time capture ==> returns 2.
- ** (match-time captures should not be avoided by optimizations)
- */
- static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
- tailcall:
- switch (tree->tag) {
- case TChar: case TSet: case TAny: {
- tocharset(tree, firstset);
- return 0;
- }
- case TTrue: {
- loopset(i, firstset->cs[i] = follow->cs[i]);
- return 1;
- }
- case TFalse: {
- loopset(i, firstset->cs[i] = 0);
- return 0;
- }
- case TChoice: {
- Charset csaux;
- int e1 = getfirst(sib1(tree), follow, firstset);
- int e2 = getfirst(sib2(tree), follow, &csaux);
- loopset(i, firstset->cs[i] |= csaux.cs[i]);
- return e1 | e2;
- }
- case TSeq: {
- if (!nullable(sib1(tree))) {
- /* return getfirst(sib1(tree), fullset, firstset); */
- tree = sib1(tree); follow = fullset; goto tailcall;
- }
- else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */
- Charset csaux;
- int e2 = getfirst(sib2(tree), follow, &csaux);
- int e1 = getfirst(sib1(tree), &csaux, firstset);
- if (e1 == 0) return 0; /* 'e1' ensures that first can be used */
- else if ((e1 | e2) & 2) /* one of the children has a matchtime? */
- return 2; /* pattern has a matchtime capture */
- else return e2; /* else depends on 'e2' */
- }
- }
- case TRep: {
- getfirst(sib1(tree), follow, firstset);
- loopset(i, firstset->cs[i] |= follow->cs[i]);
- return 1; /* accept the empty string */
- }
- case TCapture: case TGrammar: case TRule: {
- /* return getfirst(sib1(tree), follow, firstset); */
- tree = sib1(tree); goto tailcall;
- }
- case TRunTime: { /* function invalidates any follow info. */
- int e = getfirst(sib1(tree), fullset, firstset);
- if (e) return 2; /* function is not "protected"? */
- else return 0; /* pattern inside capture ensures first can be used */
- }
- case TCall: {
- /* return getfirst(sib2(tree), follow, firstset); */
- tree = sib2(tree); goto tailcall;
- }
- case TAnd: {
- int e = getfirst(sib1(tree), follow, firstset);
- loopset(i, firstset->cs[i] &= follow->cs[i]);
- return e;
- }
- case TNot: {
- if (tocharset(sib1(tree), firstset)) {
- cs_complement(firstset);
- return 1;
- }
- /* else go through */
- }
- case TBehind: { /* instruction gives no new information */
- /* call 'getfirst' to check for math-time captures */
- int e = getfirst(sib1(tree), follow, firstset);
- loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */
- return e | 1; /* always can accept the empty string */
- }
- default: assert(0); return 0;
- }
- }
- /*
- ** If it returns true, then pattern can fail only depending on the next
- ** character of the subject
- */
- static int headfail (TTree *tree) {
- tailcall:
- switch (tree->tag) {
- case TChar: case TSet: case TAny: case TFalse:
- return 1;
- case TTrue: case TRep: case TRunTime: case TNot:
- case TBehind:
- return 0;
- case TCapture: case TGrammar: case TRule: case TAnd:
- tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
- case TCall:
- tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */
- case TSeq:
- if (!nofail(sib2(tree))) return 0;
- /* else return headfail(sib1(tree)); */
- tree = sib1(tree); goto tailcall;
- case TChoice:
- if (!headfail(sib1(tree))) return 0;
- /* else return headfail(sib2(tree)); */
- tree = sib2(tree); goto tailcall;
- default: assert(0); return 0;
- }
- }
- /*
- ** Check whether the code generation for the given tree can benefit
- ** from a follow set (to avoid computing the follow set when it is
- ** not needed)
- */
- static int needfollow (TTree *tree) {
- tailcall:
- switch (tree->tag) {
- case TChar: case TSet: case TAny:
- case TFalse: case TTrue: case TAnd: case TNot:
- case TRunTime: case TGrammar: case TCall: case TBehind:
- return 0;
- case TChoice: case TRep:
- return 1;
- case TCapture:
- tree = sib1(tree); goto tailcall;
- case TSeq:
- tree = sib2(tree); goto tailcall;
- default: assert(0); return 0;
- }
- }
- /* }====================================================== */
- /*
- ** {======================================================
- ** Code generation
- ** =======================================================
- */
- /*
- ** size of an instruction
- */
- int sizei (const Instruction *i) {
- switch((Opcode)i->i.code) {
- case ISet: case ISpan: return CHARSETINSTSIZE;
- case ITestSet: return CHARSETINSTSIZE + 1;
- case ITestChar: case ITestAny: case IChoice: case IJmp:
- case ICall: case IOpenCall: case ICommit: case IPartialCommit:
- case IBackCommit: return 2;
- default: return 1;
- }
- }
- /*
- ** state for the compiler
- */
- typedef struct CompileState {
- Pattern *p; /* pattern being compiled */
- int ncode; /* next position in p->code to be filled */
- lua_State *L;
- } CompileState;
- /*
- ** code generation is recursive; 'opt' indicates that the code is
- ** being generated under an 'IChoice' operator jumping to its end.
- ** 'tt' points to a previous test protecting this code. 'fl' is
- ** the follow set of the pattern.
- */
- static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
- const Charset *fl);
- void reallocprog (lua_State *L, Pattern *p, int nsize) {
- void *ud;
- lua_Alloc f = lua_getallocf(L, &ud);
- void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction),
- nsize * sizeof(Instruction));
- if (newblock == NULL && nsize > 0)
- luaL_error(L, "not enough memory");
- p->code = (Instruction *)newblock;
- p->codesize = nsize;
- }
- static int nextinstruction (CompileState *compst) {
- int size = compst->p->codesize;
- if (compst->ncode >= size)
- reallocprog(compst->L, compst->p, size * 2);
- return compst->ncode++;
- }
/* instruction slot at index 'i' of the pattern being compiled (an lvalue) */
#define getinstr(cs,i)		(*((cs)->p->code + (i)))
- static int addinstruction (CompileState *compst, Opcode op, int aux) {
- int i = nextinstruction(compst);
- getinstr(compst, i).i.code = op;
- getinstr(compst, i).i.aux = aux;
- return i;
- }
- static int addoffsetinst (CompileState *compst, Opcode op) {
- int i = addinstruction(compst, op, 0); /* instruction */
- addinstruction(compst, (Opcode)0, 0); /* open space for offset */
- assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2);
- return i;
- }
- static void setoffset (CompileState *compst, int instruction, int offset) {
- getinstr(compst, instruction + 1).offset = offset;
- }
- /*
- ** Add a capture instruction:
- ** 'op' is the capture instruction; 'cap' the capture kind;
- ** 'key' the key into ktable; 'aux' is optional offset
- **
- */
- static int addinstcap (CompileState *compst, Opcode op, int cap, int key,
- int aux) {
- int i = addinstruction(compst, op, joinkindoff(cap, aux));
- getinstr(compst, i).i.key = key;
- return i;
- }
/* next position in the code array to be filled */
#define gethere(compst) 	((compst)->ncode)

/*
** Absolute destination of the jump-like instruction at index 'i':
** its own index plus the relative offset stored in the slot that
** follows it. Both arguments are fully parenthesized so that
** arbitrary expressions can be passed; note that 'i' is evaluated
** twice, so it must not have side effects.
*/
#define target(code,i)		((i) + (code)[(i) + 1].offset)
- static void jumptothere (CompileState *compst, int instruction, int target) {
- if (instruction >= 0)
- setoffset(compst, instruction, target - instruction);
- }
- static void jumptohere (CompileState *compst, int instruction) {
- jumptothere(compst, instruction, gethere(compst));
- }
- /*
- ** Code an IChar instruction, or IAny if there is an equivalent
- ** test dominating it
- */
- static void codechar (CompileState *compst, int c, int tt) {
- if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar &&
- getinstr(compst, tt).i.aux == c)
- addinstruction(compst, IAny, 0);
- else
- addinstruction(compst, IChar, c);
- }
- /*
- ** Add a charset posfix to an instruction
- */
- static void addcharset (CompileState *compst, const byte *cs) {
- int p = gethere(compst);
- int i;
- for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
- nextinstruction(compst); /* space for buffer */
- /* fill buffer with charset */
- loopset(j, getinstr(compst, p).buff[j] = cs[j]);
- }
- /*
- ** code a char set, optimizing unit sets for IChar, "complete"
- ** sets for IAny, and empty sets for IFail; also use an IAny
- ** when instruction is dominated by an equivalent test.
- */
- static void codecharset (CompileState *compst, const byte *cs, int tt) {
- int c = 0; /* (=) to avoid warnings */
- Opcode op = charsettype(cs, &c);
- switch (op) {
- case IChar: codechar(compst, c, tt); break;
- case ISet: { /* non-trivial set? */
- if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet &&
- cs_equal(cs, getinstr(compst, tt + 2).buff))
- addinstruction(compst, IAny, 0);
- else {
- addinstruction(compst, ISet, 0);
- addcharset(compst, cs);
- }
- break;
- }
- default: addinstruction(compst, op, c); break;
- }
- }
- /*
- ** code a test set, optimizing unit sets for ITestChar, "complete"
- ** sets for ITestAny, and empty sets for IJmp (always fails).
- ** 'e' is true iff test should accept the empty string. (Test
- ** instructions in the current VM never accept the empty string.)
- */
- static int codetestset (CompileState *compst, Charset *cs, int e) {
- if (e) return NOINST; /* no test */
- else {
- int c = 0;
- Opcode op = charsettype(cs->cs, &c);
- switch (op) {
- case IFail: return addoffsetinst(compst, IJmp); /* always jump */
- case IAny: return addoffsetinst(compst, ITestAny);
- case IChar: {
- int i = addoffsetinst(compst, ITestChar);
- getinstr(compst, i).i.aux = c;
- return i;
- }
- case ISet: {
- int i = addoffsetinst(compst, ITestSet);
- addcharset(compst, cs->cs);
- return i;
- }
- default: assert(0); return 0;
- }
- }
- }
- /*
- ** Find the final destination of a sequence of jumps
- */
- static int finaltarget (Instruction *code, int i) {
- while (code[i].i.code == IJmp)
- i = target(code, i);
- return i;
- }
- /*
- ** final label (after traversing any jumps)
- */
- static int finallabel (Instruction *code, int i) {
- return finaltarget(code, target(code, i));
- }
- /*
- ** <behind(p)> == behind n; <p> (where n = fixedlen(p))
- */
- static void codebehind (CompileState *compst, TTree *tree) {
- if (tree->u.n > 0)
- addinstruction(compst, IBehind, tree->u.n);
- codegen(compst, sib1(tree), 0, NOINST, fullset);
- }
- /*
- ** Choice; optimizations:
- ** - when p1 is headfail
- ** - when first(p1) and first(p2) are disjoint; than
- ** a character not in first(p1) cannot go to p1, and a character
- ** in first(p1) cannot go to p2 (at it is not in first(p2)).
- ** (The optimization is not valid if p1 accepts the empty string,
- ** as then there is no character at all...)
- ** - when p2 is empty and opt is true; a IPartialCommit can resuse
- ** the Choice already active in the stack.
- */
- static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
- const Charset *fl) {
- int emptyp2 = (p2->tag == TTrue);
- Charset cs1, cs2;
- int e1 = getfirst(p1, fullset, &cs1);
- if (headfail(p1) ||
- (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) {
- /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */
- int test = codetestset(compst, &cs1, 0);
- int jmp = NOINST;
- codegen(compst, p1, 0, test, fl);
- if (!emptyp2)
- jmp = addoffsetinst(compst, IJmp);
- jumptohere(compst, test);
- codegen(compst, p2, opt, NOINST, fl);
- jumptohere(compst, jmp);
- }
- else if (opt && emptyp2) {
- /* p1? == IPartialCommit; p1 */
- jumptohere(compst, addoffsetinst(compst, IPartialCommit));
- codegen(compst, p1, 1, NOINST, fullset);
- }
- else {
- /* <p1 / p2> ==
- test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
- int pcommit;
- int test = codetestset(compst, &cs1, e1);
- int pchoice = addoffsetinst(compst, IChoice);
- codegen(compst, p1, emptyp2, test, fullset);
- pcommit = addoffsetinst(compst, ICommit);
- jumptohere(compst, pchoice);
- jumptohere(compst, test);
- codegen(compst, p2, opt, NOINST, fl);
- jumptohere(compst, pcommit);
- }
- }
- /*
- ** And predicate
- ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
- ** (valid only when 'p' has no captures)
- */
- static void codeand (CompileState *compst, TTree *tree, int tt) {
- int n = fixedlen(tree);
- if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) {
- codegen(compst, tree, 0, tt, fullset);
- if (n > 0)
- addinstruction(compst, IBehind, n);
- }
- else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */
- int pcommit;
- int pchoice = addoffsetinst(compst, IChoice);
- codegen(compst, tree, 0, tt, fullset);
- pcommit = addoffsetinst(compst, IBackCommit);
- jumptohere(compst, pchoice);
- addinstruction(compst, IFail, 0);
- jumptohere(compst, pcommit);
- }
- }
- /*
- ** Captures: if pattern has fixed (and not too big) length, use
- ** a single IFullCapture instruction after the match; otherwise,
- ** enclose the pattern with OpenCapture - CloseCapture.
- */
- static void codecapture (CompileState *compst, TTree *tree, int tt,
- const Charset *fl) {
- int len = fixedlen(sib1(tree));
- if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) {
- codegen(compst, sib1(tree), 0, tt, fl);
- addinstcap(compst, IFullCapture, tree->cap, tree->key, len);
- }
- else {
- addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0);
- codegen(compst, sib1(tree), 0, tt, fl);
- addinstcap(compst, ICloseCapture, Cclose, 0, 0);
- }
- }
- static void coderuntime (CompileState *compst, TTree *tree, int tt) {
- addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0);
- codegen(compst, sib1(tree), 0, tt, fullset);
- addinstcap(compst, ICloseRunTime, Cclose, 0, 0);
- }
- /*
- ** Repetion; optimizations:
- ** When pattern is a charset, can use special instruction ISpan.
- ** When pattern is head fail, or if it starts with characters that
- ** are disjoint from what follows the repetions, a simple test
- ** is enough (a fail inside the repetition would backtrack to fail
- ** again in the following pattern, so there is no need for a choice).
- ** When 'opt' is true, the repetion can reuse the Choice already
- ** active in the stack.
- */
- static void coderep (CompileState *compst, TTree *tree, int opt,
- const Charset *fl) {
- Charset st;
- if (tocharset(tree, &st)) {
- addinstruction(compst, ISpan, 0);
- addcharset(compst, st.cs);
- }
- else {
- int e1 = getfirst(tree, fullset, &st);
- if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
- /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
- int jmp;
- int test = codetestset(compst, &st, 0);
- codegen(compst, tree, opt, test, fullset);
- jmp = addoffsetinst(compst, IJmp);
- jumptohere(compst, test);
- jumptothere(compst, jmp, test);
- }
- else {
- /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */
- /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */
- int commit, l2;
- int test = codetestset(compst, &st, e1);
- int pchoice = NOINST;
- if (opt)
- jumptohere(compst, addoffsetinst(compst, IPartialCommit));
- else
- pchoice = addoffsetinst(compst, IChoice);
- l2 = gethere(compst);
- codegen(compst, tree, 0, NOINST, fullset);
- commit = addoffsetinst(compst, IPartialCommit);
- jumptothere(compst, commit, l2);
- jumptohere(compst, pchoice);
- jumptohere(compst, test);
- }
- }
- }
- /*
- ** Not predicate; optimizations:
- ** In any case, if first test fails, 'not' succeeds, so it can jump to
- ** the end. If pattern is headfail, that is all (it cannot fail
- ** in other parts); this case includes 'not' of simple sets. Otherwise,
- ** use the default code (a choice plus a failtwice).
- */
- static void codenot (CompileState *compst, TTree *tree) {
- Charset st;
- int e = getfirst(tree, fullset, &st);
- int test = codetestset(compst, &st, e);
- if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */
- addinstruction(compst, IFail, 0);
- else {
- /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */
- int pchoice = addoffsetinst(compst, IChoice);
- codegen(compst, tree, 0, NOINST, fullset);
- addinstruction(compst, IFailTwice, 0);
- jumptohere(compst, pchoice);
- }
- jumptohere(compst, test);
- }
- /*
- ** change open calls to calls, using list 'positions' to find
- ** correct offsets; also optimize tail calls
- */
- static void correctcalls (CompileState *compst, int *positions,
- int from, int to) {
- int i;
- Instruction *code = compst->p->code;
- for (i = from; i < to; i += sizei(&code[i])) {
- if (code[i].i.code == IOpenCall) {
- int n = code[i].i.key; /* rule number */
- int rule = positions[n]; /* rule position */
- assert(rule == from || code[rule - 1].i.code == IRet);
- if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */
- code[i].i.code = IJmp; /* tail call */
- else
- code[i].i.code = ICall;
- jumptothere(compst, i, rule); /* call jumps to respective rule */
- }
- }
- assert(i == to);
- }
- /*
- ** Code for a grammar:
- ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
- */
- static void codegrammar (CompileState *compst, TTree *grammar) {
- int positions[MAXRULES];
- int rulenumber = 0;
- TTree *rule;
- int firstcall = addoffsetinst(compst, ICall); /* call initial rule */
- int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */
- int start = gethere(compst); /* here starts the initial rule */
- jumptohere(compst, firstcall);
- for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
- positions[rulenumber++] = gethere(compst); /* save rule position */
- codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */
- addinstruction(compst, IRet, 0);
- }
- assert(rule->tag == TTrue);
- jumptohere(compst, jumptoend);
- correctcalls(compst, positions, start, gethere(compst));
- }
- static void codecall (CompileState…
Large files files are truncated, but you can click here to view the full file