/src/ikj/main/org/jregex/Term.java
Java | 2184 lines | 1768 code | 216 blank | 200 comment | 145 complexity | 15c7d248b934c305aaa85190dc8ee590 MD5 | raw file
Possible License(s): BSD-3-Clause
Large files are truncated, but you can click here to view the full file
- /**
- * Copyright (c) 2001, Sergey A. Samokhodkin
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * - Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * - Redistributions in binary form
- * must reproduce the above copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials provided with the distribution.
- * - Neither the name of jregex nor the names of its contributors may be used
- * to endorse or promote products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
- * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @version 1.2_01
- */
- package org.jregex;
- import java.util.*;
- class Term implements REFlags{
/**
 * Kinds of nodes used in the compiled pattern graph.
 * The declaration order is deliberately left untouched so ordinals stay the same.
 */
public enum TermType {
    // ---- runtime term types ----
    CHAR,
    BITSET,
    BITSET2,
    ANY_CHAR,
    ANY_CHAR_NE,
    REG,
    REG_I,
    FIND,
    FINDREG,
    SUCCESS,

    // ---- optimization-transparent types ----
    BOUNDARY,
    DIRECTION,
    UBOUNDARY,
    UDIRECTION,
    GROUP_IN,
    GROUP_OUT,
    VOID,
    START,
    END,
    END_EOL,
    LINE_START,
    LINE_END,
    LAST_MATCH_END,
    CNT_SET_0,
    CNT_INC,
    CNT_GT_EQ,
    READ_CNT_LT,
    /** CRSTORE_CRINC: store on 'actual' search entry. */
    CRSTORE_CRINC,
    CR_SET_0,
    CR_LT,
    CR_GT_EQ,

    // ---- optimization-nontransparent types ----
    BRANCH,
    BRANCH_STORE_CNT,
    BRANCH_STORE_CNT_AUX1,
    PLOOKAHEAD_IN,
    PLOOKAHEAD_OUT,
    NLOOKAHEAD_IN,
    NLOOKAHEAD_OUT,
    PLOOKBEHIND_IN,
    PLOOKBEHIND_OUT,
    NLOOKBEHIND_IN,
    NLOOKBEHIND_OUT,
    /** INDEPENDENT_IN: functionally the same as NLOOKAHEAD_IN. */
    INDEPENDENT_IN,
    INDEPENDENT_OUT,
    REPEAT_0_INF,
    REPEAT_MIN_INF,
    REPEAT_MIN_MAX,
    REPEAT_REG_MIN_INF,
    REPEAT_REG_MIN_MAX,
    BACKTRACK_0,
    BACKTRACK_MIN,
    BACKTRACK_FIND_MIN,
    BACKTRACK_FINDREG_MIN,
    BACKTRACK_REG_MIN,
    MEMREG_CONDITION,
    LOOKAHEAD_CONDITION_IN,
    LOOKAHEAD_CONDITION_OUT,
    LOOKBEHIND_CONDITION_IN,
    LOOKBEHIND_CONDITION_OUT
}
- // compiletime: length of vars[] (see makeTree())
- static final int VARS_LENGTH=4;
- // compiletime variable indicies:
- private static final int MEMREG_COUNT=0; //refers current memreg index
- private static final int CNTREG_COUNT=1; //refers current counters number
- private static final int DEPTH=2; //refers current depth: (((depth=3)))
- private static final int LOOKAHEAD_COUNT=3; //refers current memreg index
- private static final int LIMITS_LENGTH=3;
- private static final int LIMITS_PARSE_RESULT_INDEX=2;
- private static final int LIMITS_OK=1;
- private static final int LIMITS_FAILURE=2;
- //static CustomParser[] customParsers=new CustomParser[256];
- // **** CONTROL FLOW ****
- // next-to-execute and next-if-failed commands;
- Term next,failNext;
- // **** TYPES ****
- TermType type=TermType.VOID;
- boolean inverse;
- // used with type=CHAR
- char c;
- // used with type=FIND
- int distance;
- boolean eat;
- // used with type=BITSET(2);
- boolean[] bitset;
- boolean[][] bitset2;
- boolean[] categoryBitset; //types(unicode categories)
- // used with type=BALANCE;
- char[] brackets;
- // used for optimization with type=BITSET,BITSET2
- int weight;
- // **** MEMORISATION ****
- // memory slot, used with type=REG,GROUP_IN,GROUP_OUT
- int memreg=-1;
- // **** COUNTERS ****
- // max|min number of iterations
- // used with CNT_GT_EQ ,REPEAT_* etc.;
- int minCount,maxCount;
- // used with REPEAT_*,REPEAT_REG_*;
- Term target;
- // a counter slot to increment & compare with maxCount (CNT_INC etc.);
- int cntreg=0;
- // lookahead group id;
- int lookaheadId;
- // **** COMPILE HELPERS ****
- protected Term prev,in,out,out1,first,current;
- //new!!
- protected Term branchOut;
- //protected boolean newBranch=false,closed=false;
- //protected boolean newBranch=false;
- //for debugging
- static int instances;
- int instanceNum;
/**
 * Creates a term of the default type (VOID) whose entry and exit
 * points are the term itself.
 */
Term(){
    // debugging aid: give every instance its creation sequence number
    instanceNum=instances++;
    out=this;
    in=this;
}

/**
 * Creates a self-contained term of the given type.
 *
 * @param type the kind of the new term
 */
Term(TermType type){
    this();
    this.type=type;
}
- static void makeTree(String s, int flags,Pattern re) throws PatternSyntaxException{
- char[] data=s.toCharArray();
- makeTree(data,0,data.length,flags,re);
- }
- static void makeTree(char[] data,int offset,int end,
- int flags,Pattern re) throws PatternSyntaxException{
- // memreg,counter,depth,lookahead
- int[] vars={1,0,0,0}; //don't use counters[0]
- //collect iterators for subsequent optimization
- List iterators=new ArrayList();
- Map groupNames=new LinkedHashMap();
- Pretokenizer t=new Pretokenizer(data,offset,end);
- Term term=makeTree(t,data,vars,flags,new Group(),iterators,groupNames);
- // term=(0-...-0)
- // convert closing outer bracket into success term
- term.out.type=TermType.SUCCESS;
- // term=(0-...-!!!
- //throw out opening bracket
- Term first=term.next;
- // term=...-!!!
- // Optimisation:
- Term optimized=first;
- Optimizer opt=Optimizer.find(first);
- if(opt!=null) optimized=opt.makeFirst(first);
- java.util.Iterator en=iterators.iterator();
- while(en.hasNext()){
- Iterator i=(Iterator)en.next();
- i.optimize();
- }
- // ===
- re.root=optimized;
- re.root0=first;
- re.memregs=vars[MEMREG_COUNT];
- re.counters=vars[CNTREG_COUNT];
- re.lookaheads=vars[LOOKAHEAD_COUNT];
- re.namedGroupMap=groupNames;
- }
- private static Term makeTree(Pretokenizer t,char[] data,int[] vars,
- int flags,Term term,List iterators,Map groupNames) throws PatternSyntaxException{
- //System.out.println("Term.makeTree(): flags="+flags);
- if(vars.length!=VARS_LENGTH) throw new IllegalArgumentException("vars.length should be "+VARS_LENGTH+", not "+vars.length);
- //Term term=new Term(isMemReg? vars[MEMREG_COUNT]: -1);
- // use memreg 0 as unsignificant
- //Term term=new Group(isMemReg? vars[MEMREG_COUNT]: 0);
- while(true){
- t.next();
- term.append(t.tOffset,t.tOutside,data,vars,flags,iterators,groupNames);
- switch(t.ttype){
- case Pretokenizer.FLAGS:
- flags=t.flags(flags);
- continue;
- case Pretokenizer.CLASS_GROUP:
- t.next();
- Term clg=new Term();
- CharacterClass.parseGroup(data,t.tOffset,t.tOutside,clg,
- (flags&IGNORE_CASE)>0, (flags&IGNORE_SPACES)>0,
- (flags&UNICODE)>0, (flags&XML_SCHEMA)>0);
- term.append(clg);
- continue;
- case Pretokenizer.PLAIN_GROUP:
- vars[DEPTH]++;
- //System.out.println("PLAIN_GROUP, t.tOffset="+t.tOffset+", t.tOutside="+t.tOutside+", t.flags("+flags+")="+t.flags(flags));
- term.append(makeTree(t,data,vars,t.flags(flags),new Group(),iterators,groupNames));
- break;
- case Pretokenizer.NAMED_GROUP:
- String gname=t.groupName;
- int id;
- if(Character.isDigit(gname.charAt(0))){
- try{
- id=Integer.parseInt(gname);
- }
- catch(NumberFormatException e){
- throw new PatternSyntaxException("group name starts with digit but is not a number");
- }
- if(groupNames.containsValue(new Integer(id))){
- if(t.groupDeclared) throw new PatternSyntaxException("group redeclaration: "+gname+"; use ({=id}...) for multiple group assignments");
- }
- if(vars[MEMREG_COUNT]<=id)vars[MEMREG_COUNT]=id+1;
- }
- else{
- Integer no=(Integer)groupNames.get(gname);
- if(no==null){
- id=vars[MEMREG_COUNT]++;
- groupNames.put(t.groupName,new Integer(id));
- }
- else{
- if(t.groupDeclared) throw new PatternSyntaxException("group redeclaration "+gname+"; use ({=name}...) for group reassignments");
- id=no.intValue();
- }
- }
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new Group(id),iterators,groupNames));
- break;
- case '(':
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new Group(vars[MEMREG_COUNT]++),iterators,groupNames));
- break;
- case Pretokenizer.POS_LOOKAHEAD:
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new Lookahead(vars[LOOKAHEAD_COUNT]++,true),iterators,groupNames));
- break;
- case Pretokenizer.NEG_LOOKAHEAD:
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new Lookahead(vars[LOOKAHEAD_COUNT]++,false),iterators,groupNames));
- break;
- case Pretokenizer.POS_LOOKBEHIND:
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new Lookbehind(vars[LOOKAHEAD_COUNT]++,true),iterators,groupNames));
- break;
- case Pretokenizer.NEG_LOOKBEHIND:
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new Lookbehind(vars[LOOKAHEAD_COUNT]++,false),iterators,groupNames));
- break;
- case Pretokenizer.INDEPENDENT_REGEX:
- vars[DEPTH]++;
- term.append(makeTree(t,data,vars,flags,new IndependentGroup(vars[LOOKAHEAD_COUNT]++),iterators,groupNames));
- break;
- case Pretokenizer.CONDITIONAL_GROUP:
- vars[DEPTH]++;
- t.next();
- Term fork=null;
- boolean positive=true;
- switch(t.ttype){
- case Pretokenizer.NEG_LOOKAHEAD:
- positive=false;
- case Pretokenizer.POS_LOOKAHEAD:
- vars[DEPTH]++;
- Lookahead la=new Lookahead(vars[LOOKAHEAD_COUNT]++,positive);
- makeTree(t,data,vars,flags,la,iterators,groupNames);
- fork=new ConditionalExpr(la);
- break;
- case Pretokenizer.NEG_LOOKBEHIND:
- positive=false;
- case Pretokenizer.POS_LOOKBEHIND:
- vars[DEPTH]++;
- Lookbehind lb=new Lookbehind(vars[LOOKAHEAD_COUNT]++,positive);
- makeTree(t,data,vars,flags,lb,iterators,groupNames);
- fork=new ConditionalExpr(lb);
- break;
- case '(':
- t.next();
- if(t.ttype!=')') throw new PatternSyntaxException("malformed condition");
- int memregNo;
- if(Character.isDigit(data[t.tOffset])) memregNo=makeNumber(t.tOffset,t.tOutside,data);
- else{
- String gn=new String(data,t.tOffset,t.tOutside-t.tOffset);
- Integer gno=(Integer)groupNames.get(gn);
- if(gno==null) throw new PatternSyntaxException("unknown group name in conditional expr.: "+gn);
- memregNo=gno.intValue();
- }
- fork=new ConditionalExpr(memregNo);
- break;
- default:
- throw new PatternSyntaxException("malformed conditional expression: "+t.ttype+" '"+(char)t.ttype+"'");
- }
- term.append(makeTree(t,data,vars,flags,fork,iterators,groupNames));
- break;
- case '|':
- term.newBranch();
- break;
- case Pretokenizer.END:
- if(vars[DEPTH]>0) throw new PatternSyntaxException("unbalanced parenthesis");
- term.close();
- return term;
- case ')':
- if(vars[DEPTH]<=0) throw new PatternSyntaxException("unbalanced parenthesis");
- term.close();
- vars[DEPTH]--;
- return term;
- case Pretokenizer.COMMENT:
- while(t.ttype!=')') t.next();
- continue;
- default:
- throw new PatternSyntaxException("unknown token type: "+t.ttype);
- }
- }
- }
- static int makeNumber(int off, int out, char[] data){
- int n=0;
- for(int i=off;i<out;i++){
- int d=data[i]-'0';
- if(d<0 || d>9) return -1;
- n*=10;
- n+=d;
- }
- return n;
- }
- protected void append(int offset,int end,char[] data,
- int[] vars,int flags,List iterators,Map gmap) throws PatternSyntaxException{
- //System.out.println("append("+new String(data,offset,end-offset)+")");
- //System.out.println("current="+this.current);
- int[] limits=new int[3];
- int i=offset;
- Term tmp,current=this.current;
- while(i<end){
- char c=data[i];
- boolean greedy=true;
- switch(c){
- //operations
- case '*':
- if(current==null) throw new PatternSyntaxException("missing term before *");
- i++;
- if(i<end){
- switch(data[i]) {
- case '?':
- greedy^=true;
- i++;
- break;
- case '*':
- case '+':
- throw new PatternSyntaxException("nested *?+ in regexp");
- }
- }
- tmp=greedy? makeGreedyStar(vars,current,iterators):
- makeLazyStar(vars,current);
- current=replaceCurrent(tmp);
- break;
- case '+':
- if(current==null) throw new PatternSyntaxException("missing term before +");
- i++;
- if(i<end){
- switch(data[i]) {
- case '?':
- greedy^=true;
- i++;
- break;
- case '*':
- case '+':
- throw new PatternSyntaxException("nested *?+ in regexp");
- }
- }
- tmp=greedy? makeGreedyPlus(vars,current,iterators):
- makeLazyPlus(vars,current);
- current=replaceCurrent(tmp);
- break;
- case '?':
- if(current==null) throw new PatternSyntaxException("missing term before ?");
- i++;
- if(i<end){
- switch(data[i]) {
- case '?':
- greedy^=true;
- i++;
- break;
- case '*':
- case '+':
- throw new PatternSyntaxException("nested *?+ in regexp");
- }
- }
- tmp=greedy? makeGreedyQMark(vars,current):
- makeLazyQMark(vars,current);
- current=replaceCurrent(tmp);
- break;
- case '{':
- limits[0]=0;
- limits[1]=-1;
- int le=parseLimits(i+1,end,data,limits);
- if(limits[LIMITS_PARSE_RESULT_INDEX]==LIMITS_OK){ //parse ok
- if(current==null) throw new PatternSyntaxException("missing term before {}");
- i=le;
- if(i<end && data[i]=='?'){
- greedy^=true;
- i++;
- }
- tmp=greedy? makeGreedyLimits(vars,current,limits,iterators):
- makeLazyLimits(vars,current,limits);
- current=replaceCurrent(tmp);
- break;
- }
- else{ //unicode class or named backreference
- if(data[i+1]=='\\'){ //'{\name}' - backreference
- int p=i+2;
- if(p==end) throw new PatternSyntaxException("'group_id' expected");
- while(Character.isWhitespace(data[p])){
- p++;
- if(p==end) throw new PatternSyntaxException("'group_id' expected");
- }
- BackReference br=new BackReference(-1,(flags&IGNORE_CASE)>0);
- i=parseGroupId(data,p,end,br,gmap);
- current=append(br);
- continue;
- }
- else{
- Term t=new Term();
- i=CharacterClass.parseName(data,i,end,t,false,(flags&IGNORE_SPACES)>0);
- current=append(t);
- continue;
- }
- }
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- if((flags&IGNORE_SPACES)>0){
- i++;
- continue;
- }
- //else go on as default
- //symbolic items
- default:
- tmp=new Term();
- i=parseTerm(data,i,end,tmp,flags);
- if(tmp.type==TermType.END && i<end){
- if((flags&IGNORE_SPACES)>0) {
- i++;
- while(i<end) {
- c=data[i];
- switch(c){
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- i++;
- continue;
- default:
- throw new PatternSyntaxException("'$' is not a last term in the group: <"+new String(data,offset,end-offset)+">");
- }
- }
- } else {
- throw new PatternSyntaxException("'$' is not a last term in the group: <"+new String(data,offset,end-offset)+">");
- }
- }
- //"\A"
- //if(tmp.type==START && i>(offset+1)){
- // throw new PatternSyntaxException("'^' is not a first term in the group: <"+new String(data,offset,end-offset)+">");
- //}
- current=append(tmp);
- break;
- }
- //System.out.println("next term: "+next);
- //System.out.println(" next.out="+next.out);
- //System.out.println(" next.out1="+next.out1);
- //System.out.println(" next.branchOut="+next.branchOut);
- }
- //System.out.println(in.toStringAll());
- //System.out.println("current="+current);
- //System.out.println();
- }
- private static int parseGroupId(char[] data, int i, int end, Term term, Map gmap) throws PatternSyntaxException{
- int id;
- int nstart=i;
- if(Character.isDigit(data[i])){
- while(Character.isDigit(data[i])){
- i++;
- if(i==end) throw new PatternSyntaxException("group_id expected");
- }
- id=makeNumber(nstart,i,data);
- }
- else{
- while(Character.isJavaIdentifierPart(data[i])){
- i++;
- if(i==end) throw new PatternSyntaxException("group_id expected");
- }
- String s=new String(data,nstart,i-nstart);
- Integer no=(Integer)gmap.get(s);
- if(no==null)throw new PatternSyntaxException("backreference to unknown group: "+s);
- id=no.intValue();
- }
- while(Character.isWhitespace(data[i])){
- i++;
- if(i==end) throw new PatternSyntaxException("'}' expected");
- }
- int c=data[i++];
- if(c!='}') throw new PatternSyntaxException("'}' expected");
- term.memreg=id;
- return i;
- }
- protected Term append(Term term) throws PatternSyntaxException{
- //System.out.println("append("+term.toStringAll()+"), this="+toStringAll());
- //Term prev=this.prev;
- Term current=this.current;
- if(current==null){
- //System.out.println("2");
- //System.out.println(" term="+term);
- //System.out.println(" term.in="+term.in);
- in.next=term;
- term.prev=in;
- this.current=term;
- //System.out.println(" result: "+in.toStringAll()+"\r\n");
- return term;
- }
- //System.out.println("3");
- link(current,term);
- //this.prev=current;
- this.current=term;
- //System.out.println(in.toStringAll());
- //System.out.println("current="+this.current);
- //System.out.println();
- return term;
- }
- protected Term replaceCurrent(Term term) throws PatternSyntaxException{
- //System.out.println("replaceCurrent("+term+"), current="+current+", current.prev="+current.prev);
- //Term prev=this.prev;
- Term prev=current.prev;
- if(prev!=null){
- Term in=this.in;
- if(prev==in){
- //in.next=term;
- //term.prev=in;
- in.next=term.in;
- term.in.prev=in;
- }
- else link(prev,term);
- }
- this.current=term;
- //System.out.println(" new current="+this.current);
- return term;
- }
- protected void newBranch() throws PatternSyntaxException{
- //System.out.println("newBranch()");
- close();
- startNewBranch();
- //System.out.println(in.toStringAll());
- //System.out.println("current="+current);
- //System.out.println();
- }
- protected void close() throws PatternSyntaxException{
- //System.out.println("close(), current="+current+", this="+toStringAll());
- //System.out.println();
- //System.out.println("close()");
- //System.out.println("current="+this.current);
- //System.out.println("prev="+this.prev);
- //System.out.println();
- /*
- Term prev=this.prev;
- if(prev!=null){
- Term current=this.current;
- if(current!=null){
- link(prev,current);
- prev=current;
- this.current=null;
- }
- link(prev,out);
- this.prev=null;
- }
- */
- Term current=this.current;
- if(current!=null) linkd(current,out);
- else in.next=out;
- //System.out.println(in.toStringAll());
- //System.out.println("current="+this.current);
- //System.out.println("prev="+this.prev);
- //System.out.println();
- }
- private final static void link(Term term,Term next){
- linkd(term,next.in);
- next.prev=term;
- }
- private final static void linkd(Term term,Term next){
- //System.out.println("linkDirectly(\""+term+"\" -> \""+next+"\")");
- Term prev_out=term.out;
- if(prev_out!=null){
- //System.out.println(" prev_out="+prev_out);
- prev_out.next=next;
- }
- Term prev_out1=term.out1;
- if(prev_out1!=null){
- //System.out.println(" prev_out1="+prev_out1);
- prev_out1.next=next;
- }
- Term prev_branch=term.branchOut;
- if(prev_branch!=null){
- //System.out.println(" prev_branch="+prev_branch);
- prev_branch.failNext=next;
- }
- }
- protected void startNewBranch() throws PatternSyntaxException{
- //System.out.println("newBranch()");
- //System.out.println("before startNewBranch(), this="+toStringAll());
- //System.out.println();
- Term tmp=in.next;
- Term b=new Branch();
- in.next=b;
- b.next=tmp;
- b.in=null;
- b.out=null;
- b.out1=null;
- b.branchOut=b;
- current=b;
- //System.out.println("startNewBranch(), this="+toStringAll());
- //System.out.println();
- }
- private final static Term makeGreedyStar(int[] vars,Term term,List iterators) throws PatternSyntaxException{
- //vars[STACK_SIZE]++;
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case INDEPENDENT_IN:
- case GROUP_IN:{
- Term b=new Branch();
- b.next=term.in;
- term.out.next=b;
- b.in=b;
- b.out=null;
- b.out1=null;
- b.branchOut=b;
- return b;
- }
- default:{
- Iterator i=new Iterator(term,0,-1,iterators);
- return i;
- }
- }
- }
- private final static Term makeLazyStar(int[] vars,Term term){
- //vars[STACK_SIZE]++;
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case GROUP_IN:{
- Term b=new Branch();
- b.failNext=term.in;
- term.out.next=b;
- b.in=b;
- b.out=b;
- b.out1=null;
- b.branchOut=null;
- return b;
- }
- default:{
- Term b=new Branch();
- b.failNext=term;
- term.next=b;
- b.in=b;
- b.out=b;
- b.out1=null;
- b.branchOut=null;
- return b;
- }
- }
- }
- private final static Term makeGreedyPlus(int[] vars,Term term,List iterators) throws PatternSyntaxException{
- //vars[STACK_SIZE]++;
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case INDEPENDENT_IN://?
- case GROUP_IN:{
- //System.out.println("makeGreedyPlus():");
- //System.out.println(" in="+term.in);
- //System.out.println(" out="+term.out);
- Term b=new Branch();
- b.next=term.in;
- term.out.next=b;
- b.in=term.in;
- b.out=null;
- b.out1=null;
- b.branchOut=b;
- //System.out.println(" returning "+b.in);
- return b;
- }
- default:{
- return new Iterator(term,1,-1,iterators);
- }
- }
- }
- private final static Term makeLazyPlus(int[] vars,Term term){
- //vars[STACK_SIZE]++;
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case GROUP_IN:{
- Term b=new Branch();
- term.out.next=b;
- b.failNext=term.in;
- b.in=term.in;
- b.out=b;
- b.out1=null;
- b.branchOut=null;
- return b;
- }
- case REG:
- default:{
- Term b=new Branch();
- term.next=b;
- b.failNext=term;
- b.in=term;
- b.out=b;
- b.out1=null;
- b.branchOut=null;
- return b;
- }
- }
- }
- private final static Term makeGreedyQMark(int[] vars,Term term){
- //vars[STACK_SIZE]++;
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case GROUP_IN:{
- Term b=new Branch();
- b.next=term.in;
- b.in=b;
- b.out=term.out;
- b.out1=null;
- b.branchOut=b;
- return b;
- }
- case REG:
- default:{
- Term b=new Branch();
- b.next=term;
- b.in=b;
- b.out=term;
- b.out1=null;
- b.branchOut=b;
- return b;
- }
- }
- }
- private final static Term makeLazyQMark(int[] vars,Term term){
- //vars[STACK_SIZE]++;
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case GROUP_IN:{
- Term b=new Branch();
- b.failNext=term.in;
- b.in=b;
- b.out=b;
- b.out1=term.out;
- b.branchOut=null;
- return b;
- }
- case REG:
- default:{
- Term b=new Branch();
- b.failNext=term;
- b.in=b;
- b.out=b;
- b.out1=term;
- b.branchOut=null;
- return b;
- }
- }
- }
- private final static Term makeGreedyLimits(int[] vars,Term term,int[] limits,List iterators) throws PatternSyntaxException{
- //vars[STACK_SIZE]++;
- int m=limits[0];
- int n=limits[1];
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case GROUP_IN:{
- int cntreg=vars[CNTREG_COUNT]++;
- Term reset=new Term(TermType.CR_SET_0);
- reset.cntreg=cntreg;
- Term b=new Term(TermType.BRANCH);
- Term inc=new Term(TermType.CRSTORE_CRINC);
- inc.cntreg=cntreg;
- reset.next=b;
- if(n>=0){
- Term lt=new Term(TermType.CR_LT);
- lt.cntreg=cntreg;
- lt.maxCount=n;
- b.next=lt;
- lt.next=term.in;
- }
- else{
- b.next=term.in;
- }
- term.out.next=inc;
- inc.next=b;
- if(m>=0){
- Term gt=new Term(TermType.CR_GT_EQ);
- gt.cntreg=cntreg;
- gt.maxCount=m;
- b.failNext=gt;
- reset.in=reset;
- reset.out=gt;
- reset.out1=null;
- reset.branchOut=null;
- }
- else{
- reset.in=reset;
- reset.out=null;
- reset.out1=null;
- reset.branchOut=b;
- }
- return reset;
- }
- default:{
- return new Iterator(term,limits[0],limits[1],iterators);
- }
- }
- }
- private final static Term makeLazyLimits(int[] vars,Term term,int[] limits){
- //vars[STACK_SIZE]++;
- int m=limits[0];
- int n=limits[1];
- switch(term.type){
- case REPEAT_0_INF:
- case REPEAT_MIN_INF:
- case REPEAT_MIN_MAX:
- case REPEAT_REG_MIN_INF:
- case REPEAT_REG_MIN_MAX:
- case GROUP_IN:{
- int cntreg=vars[CNTREG_COUNT]++;
- Term reset=new Term(TermType.CR_SET_0);
- reset.cntreg=cntreg;
- Term b=new Term(TermType.BRANCH);
- Term inc=new Term(TermType.CRSTORE_CRINC);
- inc.cntreg=cntreg;
- reset.next=b;
- if(n>=0){
- Term lt=new Term(TermType.CR_LT);
- lt.cntreg=cntreg;
- lt.maxCount=n;
- b.failNext=lt;
- lt.next=term.in;
- }
- else{
- b.failNext=term.in;
- }
- term.out.next=inc;
- inc.next=b;
- if(m>=0){
- Term gt=new Term(TermType.CR_GT_EQ);
- gt.cntreg=cntreg;
- gt.maxCount=m;
- b.next=gt;
- reset.in=reset;
- reset.out=gt;
- reset.out1=null;
- reset.branchOut=null;
- return reset;
- }
- else{
- reset.in=reset;
- reset.out=b;
- reset.out1=null;
- reset.branchOut=null;
- return reset;
- }
- }
- case REG:
- default:{
- Term reset=new Term(TermType.CNT_SET_0);
- Term b=new Branch(TermType.BRANCH_STORE_CNT);
- Term inc=new Term(TermType.CNT_INC);
- reset.next=b;
- if(n>=0){
- Term lt=new Term(TermType.READ_CNT_LT);
- lt.maxCount=n;
- b.failNext=lt;
- lt.next=term;
- term.next=inc;
- inc.next=b;
- }
- else{
- b.next=term;
- term.next=inc;
- inc.next=term;
- }
- if(m>=0){
- Term gt=new Term(TermType.CNT_GT_EQ);
- gt.maxCount=m;
- b.next=gt;
- reset.in=reset;
- reset.out=gt;
- reset.out1=null;
- reset.branchOut=null;
- return reset;
- }
- else{
- reset.in=reset;
- reset.out=b;
- reset.out1=null;
- reset.branchOut=null;
- return reset;
- }
- }
- }
- }
- private final int parseTerm(char[] data, int i, int out, Term term,
- int flags) throws PatternSyntaxException{
- char c=data[i++];
- boolean inv=false;
- switch(c){
- case '[':
- return CharacterClass.parseClass(data,i,out,term,(flags&IGNORE_CASE)>0,(flags&IGNORE_SPACES)>0,(flags&UNICODE)>0,(flags&XML_SCHEMA)>0);
- case '.':
- term.type=(flags&DOTALL)>0? TermType.ANY_CHAR: TermType.ANY_CHAR_NE;
- break;
- case '$':
- //term.type=mods[MULTILINE_IND]? LINE_END: END; //??
- term.type=(flags&MULTILINE)>0? TermType.LINE_END: TermType.END_EOL;
- break;
- case '^':
- term.type=(flags&MULTILINE)>0? TermType.LINE_START: TermType.START;
- break;
- case '\\':
- if(i>=out) throw new PatternSyntaxException("Escape without a character");
- c=data[i++];
- esc: switch(c){
- case 'f':
- c='\f'; // form feed
- break;
- case 'n':
- c='\n'; // new line
- break;
- case 'r':
- c='\r'; // carriage return
- break;
- case 't':
- c='\t'; // tab
- break;
- case '\\':
- c='\\';
- break;
- case 'u':
- if(i+4 >= out) throw new PatternSyntaxException("To few characters for u-escape");
- c=(char)((CharacterClass.toHexDigit(data[i++])<<12)+
- (CharacterClass.toHexDigit(data[i++])<<8)+
- (CharacterClass.toHexDigit(data[i++])<<4)+
- CharacterClass.toHexDigit(data[i++]));
- break;
- case 'v':
- if(i+6 >= out) throw new PatternSyntaxException("To few characters for u-escape");
- c=(char)((CharacterClass.toHexDigit(data[i++])<<24)+
- (CharacterClass.toHexDigit(data[i++])<<16)+
- (CharacterClass.toHexDigit(data[i++])<<12)+
- (CharacterClass.toHexDigit(data[i++])<<8)+
- (CharacterClass.toHexDigit(data[i++])<<4)+
- CharacterClass.toHexDigit(data[i++]));
- break;
- case 'x':{ // hex 2-digit number -> char
- if(i >= out) throw new PatternSyntaxException("To few characters for x-escape");
- int hex=0;
- char d;
- if((d=data[i++])=='{'){
- while(i<out && (d=data[i++])!='}'){
- hex=(hex<<4)+CharacterClass.toHexDigit(d);
- if(hex>0xffff) throw new PatternSyntaxException("\\x{<out of range>}");
- }
- }
- else{
- if(i >= out) throw new PatternSyntaxException("To few characters for x-escape");
- hex=(CharacterClass.toHexDigit(d)<<4)+
- CharacterClass.toHexDigit(data[i++]);
- }
- c=(char)hex;
- break;
- }
- case '0':
- case 'o': // oct 2- or 3-digit number -> char
- int oct=0;
- for(;;){
- char d=data[i];
- if(d>='0' && d<='7'){
- i++;
- oct*=8;
- oct+=d-'0';
- if(oct>0xffff) break;
- if(i>=out) break;
- }
- else break;
- }
- c=(char)oct;
- break;
- case 'm': // decimal number -> char
- int dec=0;
- for(;;){
- char d=data[i++];
- if(d>='0' && d<='9'){
- dec*=10;
- dec+=d-'0';
- if(dec>0xffff) break;
- if(i>=out) break;
- }
- else break;
- }
- i--;
- c=(char)dec;
- break;
- case 'c': // ctrl-char
- c=(char)(data[i++]&0x1f);
- break;
- case 'D': // non-digit
- inv=true;
- // go on
- case 'd': // digit
- CharacterClass.makeDigit(term,inv,(flags&UNICODE)>0);
- return i;
- case 'S': // non-space
- inv=true;
- // go on
- case 's': // space
- CharacterClass.makeSpace(term,inv,(flags&UNICODE)>0);
- return i;
- case 'W': // non-letter
- inv=true;
- // go on
- case 'w': // letter
- CharacterClass.makeWordChar(term,inv,(flags&UNICODE)>0);
- return i;
- case 'B': // non-(word boundary)
- inv=true;
- // go on
- case 'b': // word boundary
- CharacterClass.makeWordBoundary(term,inv,(flags&UNICODE)>0);
- return i;
- case '<': // non-(word boundary)
- CharacterClass.makeWordStart(term,(flags&UNICODE)>0);
- return i;
- case '>': // word boundary
- CharacterClass.makeWordEnd(term,(flags&UNICODE)>0);
- return i;
- case 'A': // text beginning
- term.type=TermType.START;
- return i;
- case 'Z': // text end
- term.type=TermType.END_EOL;
- return i;
- case 'z': // text end
- term.type=TermType.END;
- return i;
- case 'G': // end of last match
- term.type=TermType.LAST_MATCH_END;
- return i;
- case 'P': // \\P{..}
- inv=true;
- case 'p': // \\p{..}
- i=CharacterClass.parseName(data,i,out,term,inv,(flags&IGNORE_SPACES)>0);
- return i;
- default:
- if(c>='1' && c<='9'){
- int n=c-'0';
- while((i<out) && (c=data[i])>='0' && c<='9'){
- n=(n*10)+c-'0';
- i++;
- }
- term.type=(flags&IGNORE_CASE)>0? TermType.REG_I: TermType.REG;
- term.memreg=n;
- return i;
- }
- /*
- if(c<256){
- CustomParser termp=customParsers[c];
- if(termp!=null){
- i=termp.parse(i,data,term);
- return i;
- }
- }
- */
- }
- term.type=TermType.CHAR;
- term.c=c;
- break;
- default:
- if((flags&IGNORE_CASE)==0){
- term.type=TermType.CHAR;
- term.c=c;
- }
- else{
- CharacterClass.makeICase(term,c);
- }
- break;
- }
- return i;
- }
- // one of {n},{n,},{,n},{n1,n2}
- protected static final int parseLimits(int i,int end,char[] data,int[] limits) throws PatternSyntaxException{
- if(limits.length!=LIMITS_LENGTH) throw new IllegalArgumentException("maxTimess.length="+limits.length+", should be 2");
- limits[LIMITS_PARSE_RESULT_INDEX]=LIMITS_OK;
- int ind=0;
- int v=0;
- char c;
- while(i<end){
- c=data[i++];
- switch(c){
- case ' ':
- continue;
- case ',':
- if(ind>0) throw new PatternSyntaxException("illegal construction: {.. , , ..}");
- limits[ind++]=v;
- v=-1;
- continue;
- case '}':
- limits[ind]=v;
- if(ind==0) limits[1]=v;
- return i;
- default:
- if(c>'9' || c<'0'){
- //throw new PatternSyntaxException("illegal symbol in iterator: '{"+c+"}'");
- limits[LIMITS_PARSE_RESULT_INDEX]=LIMITS_FAILURE;
- return i;
- }
- if(v<0) v=0;
- v= v*10 + (c-'0');
- }
- }
- throw new PatternSyntaxException("malformed quantifier");
- }
- public String toString(){
- StringBuffer b=new StringBuffer(100);
- b.append(instanceNum);
- b.append(": ");
- if(inverse) b.append('^');
- switch(type){
- case VOID:
- b.append("[]");
- b.append(" , ");
- break;
- case CHAR:
- b.append(CharacterClass.stringValue(c));
- b.append(" , ");
- break;
- case ANY_CHAR:
- b.append("dotall, ");
- break;
- case ANY_CHAR_NE:
- b.append("dot-eols, ");
- break;
- case BITSET:
- b.append('[');
- b.append(CharacterClass.stringValue0(bitset));
- b.append(']');
- b.append(" , weight=");
- b.append(weight);
- b.append(" , ");
- break;
- case BITSET2:
- b.append('[');
- b.append(CharacterClass.stringValue2(bitset2));
- b.append(']');
- b.append(" , weight=");
- b.append(weight);
- b.append(" , ");
- break;
- case START:
- b.append("abs.start");
- break;
- case END:
- b.append("abs.end");
- break;
- case END_EOL:
- b.append("abs.end-eol");
- break;
- case LINE_START:
- b.append("line start");
- break;
- case LINE_END:
- b.append("line end");
- break;
- case LAST_MATCH_END:
- if(inverse)b.append("non-");
- b.append("BOUNDARY");
- break;
- case BOUNDARY:
- if(inverse)b.append("non-");
- b.append("BOUNDARY");
- break;
- case UBOUNDARY:
- if(inverse)b.append("non-");
- b.append("UBOUNDARY");
- break;
- case DIRECTION:
- b.append("DIRECTION");
- break;
- case UDIRECTION:
- b.append("UDIRECTION");
- break;
- case FIND:
- b.append(">>>{");
- b.append(target);
- b.append("}, <<");
- b.append(distance);
- if(eat){
- b.append(",eat");
- }
- b.append(", ");
- break;
- case REPEAT_0_INF:
- b.append("rpt{");
- b.append(target);
- b.append(",0,inf}");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case REPEAT_MIN_INF:
- b.append("rpt{");
- b.append(target);
- b.append(",");
- b.append(minCount);
- b.append(",inf}");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case REPEAT_MIN_MAX:
- b.append("rpt{");
- b.append(target);
- b.append(",");
- b.append(minCount);
- b.append(",");
- b.append(maxCount);
- b.append("}");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case REPEAT_REG_MIN_INF:
- b.append("rpt{$");
- b.append(memreg);
- b.append(',');
- b.append(minCount);
- b.append(",inf}");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case REPEAT_REG_MIN_MAX:
- b.append("rpt{$");
- b.append(memreg);
- b.append(',');
- b.append(minCount);
- b.append(',');
- b.append(maxCount);
- b.append("}");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case BACKTRACK_0:
- b.append("back(0)");
- break;
- case BACKTRACK_MIN:
- b.append("back(");
- b.append(minCount);
- b.append(")");
- break;
- case BACKTRACK_REG_MIN:
- b.append("back");
- b.append("_$");
- b.append(memreg);
- b.append("(");
- b.append(minCount);
- b.append(")");
- break;
- case GROUP_IN:
- b.append('(');
- if(memreg>0)b.append(memreg);
- b.append('-');
- b.append(" , ");
- break;
- case GROUP_OUT:
- b.append('-');
- if(memreg>0)b.append(memreg);
- b.append(')');
- b.append(" , ");
- break;
- case PLOOKAHEAD_IN:
- b.append('(');
- b.append("=");
- b.append(lookaheadId);
- b.append(" , ");
- break;
- case PLOOKAHEAD_OUT:
- b.append('=');
- b.append(lookaheadId);
- b.append(')');
- b.append(" , ");
- break;
- case NLOOKAHEAD_IN:
- b.append("(!");
- b.append(lookaheadId);
- b.append(" , ");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case NLOOKAHEAD_OUT:
- b.append('!');
- b.append(lookaheadId);
- b.append(')');
- b.append(" , ");
- break;
- case PLOOKBEHIND_IN:
- b.append('(');
- b.append("<=");
- b.append(lookaheadId);
- b.append(" , dist=");
- b.append(distance);
- b.append(" , ");
- break;
- case PLOOKBEHIND_OUT:
- b.append("<=");
- b.append(lookaheadId);
- b.append(')');
- b.append(" , ");
- break;
- case NLOOKBEHIND_IN:
- b.append("(<!");
- b.append(lookaheadId);
- b.append(" , dist=");
- b.append(distance);
- b.append(" , ");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case NLOOKBEHIND_OUT:
- b.append("<!");
- b.append(lookaheadId);
- b.append(')');
- b.append(" , ");
- break;
- case MEMREG_CONDITION:
- b.append("(reg");
- b.append(memreg);
- b.append("?)");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case LOOKAHEAD_CONDITION_IN:
- b.append("(cond");
- b.append(lookaheadId);
- b.append(((Lookahead)this).isPositive? '=': '!');
- b.append(" , ");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case LOOKAHEAD_CONDITION_OUT:
- b.append("cond");
- b.append(lookaheadId);
- b.append(")");
- if(failNext!=null){
- b.append(", =>");
- b.append(failNext.instanceNum);
- b.append(", ");
- }
- break;
- case REG:
- b.append("$");
- b.append(memreg);
- b.append(", ");
- break;
- case SUCCESS:
- b.append("END");
- break;
- case BRANCH_STORE_CNT_AUX1:
- b.append("(aux1)");
- case BRANCH_STORE_CNT:
- b.append("(cnt)");
- case BRANCH:
- b.append("=>");
- if(failNext!=null) b.append(failNext.instanceNum);
- else b.append("null");
- b.append(" , ");
- break;
- default:
- b.append('[');
- switch(type){
- case CNT_SET_0:
- b.append("cnt=0");
- …
Large files are truncated, but you can click here to view the full file