PageRenderTime 56ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/src/ikj/main/org/jregex/Term.java

https://github.com/olabini/ioke
Java | 2184 lines | 1768 code | 216 blank | 200 comment | 145 complexity | 15c7d248b934c305aaa85190dc8ee590 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /**
  2. * Copyright (c) 2001, Sergey A. Samokhodkin
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without modification,
  6. * are permitted provided that the following conditions are met:
  7. *
  8. * - Redistributions of source code must retain the above copyright notice,
  9. * this list of conditions and the following disclaimer.
  10. * - Redistributions in binary form
  11. * must reproduce the above copyright notice, this list of conditions and the following
  12. * disclaimer in the documentation and/or other materials provided with the distribution.
  13. * - Neither the name of jregex nor the names of its contributors may be used
  14. * to endorse or promote products derived from this software without specific prior
  15. * written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
  18. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20. * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  21. * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  23. * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  25. * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *
  27. * @version 1.2_01
  28. */
  29. package org.jregex;
  30. import java.util.*;
  31. class Term implements REFlags{
  /**
   * Node kinds used by the compiled pattern graph.
   * The declaration order is unchanged from the previous layout, so ordinals are stable.
   */
  public enum TermType {
    // ---- runtime term types ----
    CHAR,
    BITSET,
    BITSET2,
    ANY_CHAR,
    ANY_CHAR_NE,
    REG,
    REG_I,
    FIND,
    FINDREG,
    SUCCESS,

    // ---- optimization-transparent types ----
    BOUNDARY,
    DIRECTION,
    UBOUNDARY,
    UDIRECTION,
    GROUP_IN,
    GROUP_OUT,
    VOID,
    START,
    END,
    END_EOL,
    LINE_START,
    LINE_END,
    LAST_MATCH_END,
    CNT_SET_0,
    CNT_INC,
    CNT_GT_EQ,
    READ_CNT_LT,
    // CRSTORE_CRINC: store on 'actual' search entry
    CRSTORE_CRINC,
    CR_SET_0,
    CR_LT,
    CR_GT_EQ,

    // ---- optimization-nontransparent types ----
    BRANCH,
    BRANCH_STORE_CNT,
    BRANCH_STORE_CNT_AUX1,
    PLOOKAHEAD_IN,
    PLOOKAHEAD_OUT,
    NLOOKAHEAD_IN,
    NLOOKAHEAD_OUT,
    PLOOKBEHIND_IN,
    PLOOKBEHIND_OUT,
    NLOOKBEHIND_IN,
    NLOOKBEHIND_OUT,
    // INDEPENDENT_IN: functionally the same as NLOOKAHEAD_IN
    INDEPENDENT_IN,
    INDEPENDENT_OUT,
    REPEAT_0_INF,
    REPEAT_MIN_INF,
    REPEAT_MIN_MAX,
    REPEAT_REG_MIN_INF,
    REPEAT_REG_MIN_MAX,
    BACKTRACK_0,
    BACKTRACK_MIN,
    BACKTRACK_FIND_MIN,
    BACKTRACK_FINDREG_MIN,
    BACKTRACK_REG_MIN,
    MEMREG_CONDITION,
    LOOKAHEAD_CONDITION_IN,
    LOOKAHEAD_CONDITION_OUT,
    LOOKBEHIND_CONDITION_IN,
    LOOKBEHIND_CONDITION_OUT
  }
  53. // compiletime: length of vars[] (see makeTree())
  54. static final int VARS_LENGTH=4;
  55. // compiletime variable indicies:
  56. private static final int MEMREG_COUNT=0; //refers current memreg index
  57. private static final int CNTREG_COUNT=1; //refers current counters number
  58. private static final int DEPTH=2; //refers current depth: (((depth=3)))
  59. private static final int LOOKAHEAD_COUNT=3; //refers current memreg index
  60. private static final int LIMITS_LENGTH=3;
  61. private static final int LIMITS_PARSE_RESULT_INDEX=2;
  62. private static final int LIMITS_OK=1;
  63. private static final int LIMITS_FAILURE=2;
  64. //static CustomParser[] customParsers=new CustomParser[256];
  65. // **** CONTROL FLOW ****
  66. // next-to-execute and next-if-failed commands;
  67. Term next,failNext;
  68. // **** TYPES ****
  69. TermType type=TermType.VOID;
  70. boolean inverse;
  71. // used with type=CHAR
  72. char c;
  73. // used with type=FIND
  74. int distance;
  75. boolean eat;
  76. // used with type=BITSET(2);
  77. boolean[] bitset;
  78. boolean[][] bitset2;
  79. boolean[] categoryBitset; //types(unicode categories)
  80. // used with type=BALANCE;
  81. char[] brackets;
  82. // used for optimization with type=BITSET,BITSET2
  83. int weight;
  84. // **** MEMORISATION ****
  85. // memory slot, used with type=REG,GROUP_IN,GROUP_OUT
  86. int memreg=-1;
  87. // **** COUNTERS ****
  88. // max|min number of iterations
  89. // used with CNT_GT_EQ ,REPEAT_* etc.;
  90. int minCount,maxCount;
  91. // used with REPEAT_*,REPEAT_REG_*;
  92. Term target;
  93. // a counter slot to increment & compare with maxCount (CNT_INC etc.);
  94. int cntreg=0;
  95. // lookahead group id;
  96. int lookaheadId;
  97. // **** COMPILE HELPERS ****
  98. protected Term prev,in,out,out1,first,current;
  99. //new!!
  100. protected Term branchOut;
  101. //protected boolean newBranch=false,closed=false;
  102. //protected boolean newBranch=false;
  103. //for debugging
  104. static int instances;
  105. int instanceNum;
  /**
   * Creates a term of the default type (VOID) whose entry and exit points are the term itself.
   * Also stamps the term with a sequence number taken from the static instance counter
   * (kept for debugging).
   */
  Term() {
    instanceNum = instances++;
    out = this;
    in = this;
  }

  /**
   * Creates a term of the given type; otherwise identical to {@link #Term()}.
   *
   * @param type the node kind to assign
   */
  Term(TermType type) {
    this();
    this.type = type;
  }
  116. static void makeTree(String s, int flags,Pattern re) throws PatternSyntaxException{
  117. char[] data=s.toCharArray();
  118. makeTree(data,0,data.length,flags,re);
  119. }
  120. static void makeTree(char[] data,int offset,int end,
  121. int flags,Pattern re) throws PatternSyntaxException{
  122. // memreg,counter,depth,lookahead
  123. int[] vars={1,0,0,0}; //don't use counters[0]
  124. //collect iterators for subsequent optimization
  125. List iterators=new ArrayList();
  126. Map groupNames=new LinkedHashMap();
  127. Pretokenizer t=new Pretokenizer(data,offset,end);
  128. Term term=makeTree(t,data,vars,flags,new Group(),iterators,groupNames);
  129. // term=(0-...-0)
  130. // convert closing outer bracket into success term
  131. term.out.type=TermType.SUCCESS;
  132. // term=(0-...-!!!
  133. //throw out opening bracket
  134. Term first=term.next;
  135. // term=...-!!!
  136. // Optimisation:
  137. Term optimized=first;
  138. Optimizer opt=Optimizer.find(first);
  139. if(opt!=null) optimized=opt.makeFirst(first);
  140. java.util.Iterator en=iterators.iterator();
  141. while(en.hasNext()){
  142. Iterator i=(Iterator)en.next();
  143. i.optimize();
  144. }
  145. // ===
  146. re.root=optimized;
  147. re.root0=first;
  148. re.memregs=vars[MEMREG_COUNT];
  149. re.counters=vars[CNTREG_COUNT];
  150. re.lookaheads=vars[LOOKAHEAD_COUNT];
  151. re.namedGroupMap=groupNames;
  152. }
  153. private static Term makeTree(Pretokenizer t,char[] data,int[] vars,
  154. int flags,Term term,List iterators,Map groupNames) throws PatternSyntaxException{
  155. //System.out.println("Term.makeTree(): flags="+flags);
  156. if(vars.length!=VARS_LENGTH) throw new IllegalArgumentException("vars.length should be "+VARS_LENGTH+", not "+vars.length);
  157. //Term term=new Term(isMemReg? vars[MEMREG_COUNT]: -1);
  158. // use memreg 0 as unsignificant
  159. //Term term=new Group(isMemReg? vars[MEMREG_COUNT]: 0);
  160. while(true){
  161. t.next();
  162. term.append(t.tOffset,t.tOutside,data,vars,flags,iterators,groupNames);
  163. switch(t.ttype){
  164. case Pretokenizer.FLAGS:
  165. flags=t.flags(flags);
  166. continue;
  167. case Pretokenizer.CLASS_GROUP:
  168. t.next();
  169. Term clg=new Term();
  170. CharacterClass.parseGroup(data,t.tOffset,t.tOutside,clg,
  171. (flags&IGNORE_CASE)>0, (flags&IGNORE_SPACES)>0,
  172. (flags&UNICODE)>0, (flags&XML_SCHEMA)>0);
  173. term.append(clg);
  174. continue;
  175. case Pretokenizer.PLAIN_GROUP:
  176. vars[DEPTH]++;
  177. //System.out.println("PLAIN_GROUP, t.tOffset="+t.tOffset+", t.tOutside="+t.tOutside+", t.flags("+flags+")="+t.flags(flags));
  178. term.append(makeTree(t,data,vars,t.flags(flags),new Group(),iterators,groupNames));
  179. break;
  180. case Pretokenizer.NAMED_GROUP:
  181. String gname=t.groupName;
  182. int id;
  183. if(Character.isDigit(gname.charAt(0))){
  184. try{
  185. id=Integer.parseInt(gname);
  186. }
  187. catch(NumberFormatException e){
  188. throw new PatternSyntaxException("group name starts with digit but is not a number");
  189. }
  190. if(groupNames.containsValue(new Integer(id))){
  191. if(t.groupDeclared) throw new PatternSyntaxException("group redeclaration: "+gname+"; use ({=id}...) for multiple group assignments");
  192. }
  193. if(vars[MEMREG_COUNT]<=id)vars[MEMREG_COUNT]=id+1;
  194. }
  195. else{
  196. Integer no=(Integer)groupNames.get(gname);
  197. if(no==null){
  198. id=vars[MEMREG_COUNT]++;
  199. groupNames.put(t.groupName,new Integer(id));
  200. }
  201. else{
  202. if(t.groupDeclared) throw new PatternSyntaxException("group redeclaration "+gname+"; use ({=name}...) for group reassignments");
  203. id=no.intValue();
  204. }
  205. }
  206. vars[DEPTH]++;
  207. term.append(makeTree(t,data,vars,flags,new Group(id),iterators,groupNames));
  208. break;
  209. case '(':
  210. vars[DEPTH]++;
  211. term.append(makeTree(t,data,vars,flags,new Group(vars[MEMREG_COUNT]++),iterators,groupNames));
  212. break;
  213. case Pretokenizer.POS_LOOKAHEAD:
  214. vars[DEPTH]++;
  215. term.append(makeTree(t,data,vars,flags,new Lookahead(vars[LOOKAHEAD_COUNT]++,true),iterators,groupNames));
  216. break;
  217. case Pretokenizer.NEG_LOOKAHEAD:
  218. vars[DEPTH]++;
  219. term.append(makeTree(t,data,vars,flags,new Lookahead(vars[LOOKAHEAD_COUNT]++,false),iterators,groupNames));
  220. break;
  221. case Pretokenizer.POS_LOOKBEHIND:
  222. vars[DEPTH]++;
  223. term.append(makeTree(t,data,vars,flags,new Lookbehind(vars[LOOKAHEAD_COUNT]++,true),iterators,groupNames));
  224. break;
  225. case Pretokenizer.NEG_LOOKBEHIND:
  226. vars[DEPTH]++;
  227. term.append(makeTree(t,data,vars,flags,new Lookbehind(vars[LOOKAHEAD_COUNT]++,false),iterators,groupNames));
  228. break;
  229. case Pretokenizer.INDEPENDENT_REGEX:
  230. vars[DEPTH]++;
  231. term.append(makeTree(t,data,vars,flags,new IndependentGroup(vars[LOOKAHEAD_COUNT]++),iterators,groupNames));
  232. break;
  233. case Pretokenizer.CONDITIONAL_GROUP:
  234. vars[DEPTH]++;
  235. t.next();
  236. Term fork=null;
  237. boolean positive=true;
  238. switch(t.ttype){
  239. case Pretokenizer.NEG_LOOKAHEAD:
  240. positive=false;
  241. case Pretokenizer.POS_LOOKAHEAD:
  242. vars[DEPTH]++;
  243. Lookahead la=new Lookahead(vars[LOOKAHEAD_COUNT]++,positive);
  244. makeTree(t,data,vars,flags,la,iterators,groupNames);
  245. fork=new ConditionalExpr(la);
  246. break;
  247. case Pretokenizer.NEG_LOOKBEHIND:
  248. positive=false;
  249. case Pretokenizer.POS_LOOKBEHIND:
  250. vars[DEPTH]++;
  251. Lookbehind lb=new Lookbehind(vars[LOOKAHEAD_COUNT]++,positive);
  252. makeTree(t,data,vars,flags,lb,iterators,groupNames);
  253. fork=new ConditionalExpr(lb);
  254. break;
  255. case '(':
  256. t.next();
  257. if(t.ttype!=')') throw new PatternSyntaxException("malformed condition");
  258. int memregNo;
  259. if(Character.isDigit(data[t.tOffset])) memregNo=makeNumber(t.tOffset,t.tOutside,data);
  260. else{
  261. String gn=new String(data,t.tOffset,t.tOutside-t.tOffset);
  262. Integer gno=(Integer)groupNames.get(gn);
  263. if(gno==null) throw new PatternSyntaxException("unknown group name in conditional expr.: "+gn);
  264. memregNo=gno.intValue();
  265. }
  266. fork=new ConditionalExpr(memregNo);
  267. break;
  268. default:
  269. throw new PatternSyntaxException("malformed conditional expression: "+t.ttype+" '"+(char)t.ttype+"'");
  270. }
  271. term.append(makeTree(t,data,vars,flags,fork,iterators,groupNames));
  272. break;
  273. case '|':
  274. term.newBranch();
  275. break;
  276. case Pretokenizer.END:
  277. if(vars[DEPTH]>0) throw new PatternSyntaxException("unbalanced parenthesis");
  278. term.close();
  279. return term;
  280. case ')':
  281. if(vars[DEPTH]<=0) throw new PatternSyntaxException("unbalanced parenthesis");
  282. term.close();
  283. vars[DEPTH]--;
  284. return term;
  285. case Pretokenizer.COMMENT:
  286. while(t.ttype!=')') t.next();
  287. continue;
  288. default:
  289. throw new PatternSyntaxException("unknown token type: "+t.ttype);
  290. }
  291. }
  292. }
  293. static int makeNumber(int off, int out, char[] data){
  294. int n=0;
  295. for(int i=off;i<out;i++){
  296. int d=data[i]-'0';
  297. if(d<0 || d>9) return -1;
  298. n*=10;
  299. n+=d;
  300. }
  301. return n;
  302. }
  303. protected void append(int offset,int end,char[] data,
  304. int[] vars,int flags,List iterators,Map gmap) throws PatternSyntaxException{
  305. //System.out.println("append("+new String(data,offset,end-offset)+")");
  306. //System.out.println("current="+this.current);
  307. int[] limits=new int[3];
  308. int i=offset;
  309. Term tmp,current=this.current;
  310. while(i<end){
  311. char c=data[i];
  312. boolean greedy=true;
  313. switch(c){
  314. //operations
  315. case '*':
  316. if(current==null) throw new PatternSyntaxException("missing term before *");
  317. i++;
  318. if(i<end){
  319. switch(data[i]) {
  320. case '?':
  321. greedy^=true;
  322. i++;
  323. break;
  324. case '*':
  325. case '+':
  326. throw new PatternSyntaxException("nested *?+ in regexp");
  327. }
  328. }
  329. tmp=greedy? makeGreedyStar(vars,current,iterators):
  330. makeLazyStar(vars,current);
  331. current=replaceCurrent(tmp);
  332. break;
  333. case '+':
  334. if(current==null) throw new PatternSyntaxException("missing term before +");
  335. i++;
  336. if(i<end){
  337. switch(data[i]) {
  338. case '?':
  339. greedy^=true;
  340. i++;
  341. break;
  342. case '*':
  343. case '+':
  344. throw new PatternSyntaxException("nested *?+ in regexp");
  345. }
  346. }
  347. tmp=greedy? makeGreedyPlus(vars,current,iterators):
  348. makeLazyPlus(vars,current);
  349. current=replaceCurrent(tmp);
  350. break;
  351. case '?':
  352. if(current==null) throw new PatternSyntaxException("missing term before ?");
  353. i++;
  354. if(i<end){
  355. switch(data[i]) {
  356. case '?':
  357. greedy^=true;
  358. i++;
  359. break;
  360. case '*':
  361. case '+':
  362. throw new PatternSyntaxException("nested *?+ in regexp");
  363. }
  364. }
  365. tmp=greedy? makeGreedyQMark(vars,current):
  366. makeLazyQMark(vars,current);
  367. current=replaceCurrent(tmp);
  368. break;
  369. case '{':
  370. limits[0]=0;
  371. limits[1]=-1;
  372. int le=parseLimits(i+1,end,data,limits);
  373. if(limits[LIMITS_PARSE_RESULT_INDEX]==LIMITS_OK){ //parse ok
  374. if(current==null) throw new PatternSyntaxException("missing term before {}");
  375. i=le;
  376. if(i<end && data[i]=='?'){
  377. greedy^=true;
  378. i++;
  379. }
  380. tmp=greedy? makeGreedyLimits(vars,current,limits,iterators):
  381. makeLazyLimits(vars,current,limits);
  382. current=replaceCurrent(tmp);
  383. break;
  384. }
  385. else{ //unicode class or named backreference
  386. if(data[i+1]=='\\'){ //'{\name}' - backreference
  387. int p=i+2;
  388. if(p==end) throw new PatternSyntaxException("'group_id' expected");
  389. while(Character.isWhitespace(data[p])){
  390. p++;
  391. if(p==end) throw new PatternSyntaxException("'group_id' expected");
  392. }
  393. BackReference br=new BackReference(-1,(flags&IGNORE_CASE)>0);
  394. i=parseGroupId(data,p,end,br,gmap);
  395. current=append(br);
  396. continue;
  397. }
  398. else{
  399. Term t=new Term();
  400. i=CharacterClass.parseName(data,i,end,t,false,(flags&IGNORE_SPACES)>0);
  401. current=append(t);
  402. continue;
  403. }
  404. }
  405. case ' ':
  406. case '\t':
  407. case '\r':
  408. case '\n':
  409. if((flags&IGNORE_SPACES)>0){
  410. i++;
  411. continue;
  412. }
  413. //else go on as default
  414. //symbolic items
  415. default:
  416. tmp=new Term();
  417. i=parseTerm(data,i,end,tmp,flags);
  418. if(tmp.type==TermType.END && i<end){
  419. if((flags&IGNORE_SPACES)>0) {
  420. i++;
  421. while(i<end) {
  422. c=data[i];
  423. switch(c){
  424. case ' ':
  425. case '\t':
  426. case '\r':
  427. case '\n':
  428. i++;
  429. continue;
  430. default:
  431. throw new PatternSyntaxException("'$' is not a last term in the group: <"+new String(data,offset,end-offset)+">");
  432. }
  433. }
  434. } else {
  435. throw new PatternSyntaxException("'$' is not a last term in the group: <"+new String(data,offset,end-offset)+">");
  436. }
  437. }
  438. //"\A"
  439. //if(tmp.type==START && i>(offset+1)){
  440. // throw new PatternSyntaxException("'^' is not a first term in the group: <"+new String(data,offset,end-offset)+">");
  441. //}
  442. current=append(tmp);
  443. break;
  444. }
  445. //System.out.println("next term: "+next);
  446. //System.out.println(" next.out="+next.out);
  447. //System.out.println(" next.out1="+next.out1);
  448. //System.out.println(" next.branchOut="+next.branchOut);
  449. }
  450. //System.out.println(in.toStringAll());
  451. //System.out.println("current="+current);
  452. //System.out.println();
  453. }
  454. private static int parseGroupId(char[] data, int i, int end, Term term, Map gmap) throws PatternSyntaxException{
  455. int id;
  456. int nstart=i;
  457. if(Character.isDigit(data[i])){
  458. while(Character.isDigit(data[i])){
  459. i++;
  460. if(i==end) throw new PatternSyntaxException("group_id expected");
  461. }
  462. id=makeNumber(nstart,i,data);
  463. }
  464. else{
  465. while(Character.isJavaIdentifierPart(data[i])){
  466. i++;
  467. if(i==end) throw new PatternSyntaxException("group_id expected");
  468. }
  469. String s=new String(data,nstart,i-nstart);
  470. Integer no=(Integer)gmap.get(s);
  471. if(no==null)throw new PatternSyntaxException("backreference to unknown group: "+s);
  472. id=no.intValue();
  473. }
  474. while(Character.isWhitespace(data[i])){
  475. i++;
  476. if(i==end) throw new PatternSyntaxException("'}' expected");
  477. }
  478. int c=data[i++];
  479. if(c!='}') throw new PatternSyntaxException("'}' expected");
  480. term.memreg=id;
  481. return i;
  482. }
  483. protected Term append(Term term) throws PatternSyntaxException{
  484. //System.out.println("append("+term.toStringAll()+"), this="+toStringAll());
  485. //Term prev=this.prev;
  486. Term current=this.current;
  487. if(current==null){
  488. //System.out.println("2");
  489. //System.out.println(" term="+term);
  490. //System.out.println(" term.in="+term.in);
  491. in.next=term;
  492. term.prev=in;
  493. this.current=term;
  494. //System.out.println(" result: "+in.toStringAll()+"\r\n");
  495. return term;
  496. }
  497. //System.out.println("3");
  498. link(current,term);
  499. //this.prev=current;
  500. this.current=term;
  501. //System.out.println(in.toStringAll());
  502. //System.out.println("current="+this.current);
  503. //System.out.println();
  504. return term;
  505. }
  506. protected Term replaceCurrent(Term term) throws PatternSyntaxException{
  507. //System.out.println("replaceCurrent("+term+"), current="+current+", current.prev="+current.prev);
  508. //Term prev=this.prev;
  509. Term prev=current.prev;
  510. if(prev!=null){
  511. Term in=this.in;
  512. if(prev==in){
  513. //in.next=term;
  514. //term.prev=in;
  515. in.next=term.in;
  516. term.in.prev=in;
  517. }
  518. else link(prev,term);
  519. }
  520. this.current=term;
  521. //System.out.println(" new current="+this.current);
  522. return term;
  523. }
  524. protected void newBranch() throws PatternSyntaxException{
  525. //System.out.println("newBranch()");
  526. close();
  527. startNewBranch();
  528. //System.out.println(in.toStringAll());
  529. //System.out.println("current="+current);
  530. //System.out.println();
  531. }
  532. protected void close() throws PatternSyntaxException{
  533. //System.out.println("close(), current="+current+", this="+toStringAll());
  534. //System.out.println();
  535. //System.out.println("close()");
  536. //System.out.println("current="+this.current);
  537. //System.out.println("prev="+this.prev);
  538. //System.out.println();
  539. /*
  540. Term prev=this.prev;
  541. if(prev!=null){
  542. Term current=this.current;
  543. if(current!=null){
  544. link(prev,current);
  545. prev=current;
  546. this.current=null;
  547. }
  548. link(prev,out);
  549. this.prev=null;
  550. }
  551. */
  552. Term current=this.current;
  553. if(current!=null) linkd(current,out);
  554. else in.next=out;
  555. //System.out.println(in.toStringAll());
  556. //System.out.println("current="+this.current);
  557. //System.out.println("prev="+this.prev);
  558. //System.out.println();
  559. }
  560. private final static void link(Term term,Term next){
  561. linkd(term,next.in);
  562. next.prev=term;
  563. }
  564. private final static void linkd(Term term,Term next){
  565. //System.out.println("linkDirectly(\""+term+"\" -> \""+next+"\")");
  566. Term prev_out=term.out;
  567. if(prev_out!=null){
  568. //System.out.println(" prev_out="+prev_out);
  569. prev_out.next=next;
  570. }
  571. Term prev_out1=term.out1;
  572. if(prev_out1!=null){
  573. //System.out.println(" prev_out1="+prev_out1);
  574. prev_out1.next=next;
  575. }
  576. Term prev_branch=term.branchOut;
  577. if(prev_branch!=null){
  578. //System.out.println(" prev_branch="+prev_branch);
  579. prev_branch.failNext=next;
  580. }
  581. }
  582. protected void startNewBranch() throws PatternSyntaxException{
  583. //System.out.println("newBranch()");
  584. //System.out.println("before startNewBranch(), this="+toStringAll());
  585. //System.out.println();
  586. Term tmp=in.next;
  587. Term b=new Branch();
  588. in.next=b;
  589. b.next=tmp;
  590. b.in=null;
  591. b.out=null;
  592. b.out1=null;
  593. b.branchOut=b;
  594. current=b;
  595. //System.out.println("startNewBranch(), this="+toStringAll());
  596. //System.out.println();
  597. }
  598. private final static Term makeGreedyStar(int[] vars,Term term,List iterators) throws PatternSyntaxException{
  599. //vars[STACK_SIZE]++;
  600. switch(term.type){
  601. case REPEAT_0_INF:
  602. case REPEAT_MIN_INF:
  603. case REPEAT_MIN_MAX:
  604. case REPEAT_REG_MIN_INF:
  605. case REPEAT_REG_MIN_MAX:
  606. case INDEPENDENT_IN:
  607. case GROUP_IN:{
  608. Term b=new Branch();
  609. b.next=term.in;
  610. term.out.next=b;
  611. b.in=b;
  612. b.out=null;
  613. b.out1=null;
  614. b.branchOut=b;
  615. return b;
  616. }
  617. default:{
  618. Iterator i=new Iterator(term,0,-1,iterators);
  619. return i;
  620. }
  621. }
  622. }
  623. private final static Term makeLazyStar(int[] vars,Term term){
  624. //vars[STACK_SIZE]++;
  625. switch(term.type){
  626. case REPEAT_0_INF:
  627. case REPEAT_MIN_INF:
  628. case REPEAT_MIN_MAX:
  629. case REPEAT_REG_MIN_INF:
  630. case REPEAT_REG_MIN_MAX:
  631. case GROUP_IN:{
  632. Term b=new Branch();
  633. b.failNext=term.in;
  634. term.out.next=b;
  635. b.in=b;
  636. b.out=b;
  637. b.out1=null;
  638. b.branchOut=null;
  639. return b;
  640. }
  641. default:{
  642. Term b=new Branch();
  643. b.failNext=term;
  644. term.next=b;
  645. b.in=b;
  646. b.out=b;
  647. b.out1=null;
  648. b.branchOut=null;
  649. return b;
  650. }
  651. }
  652. }
  653. private final static Term makeGreedyPlus(int[] vars,Term term,List iterators) throws PatternSyntaxException{
  654. //vars[STACK_SIZE]++;
  655. switch(term.type){
  656. case REPEAT_0_INF:
  657. case REPEAT_MIN_INF:
  658. case REPEAT_MIN_MAX:
  659. case REPEAT_REG_MIN_INF:
  660. case REPEAT_REG_MIN_MAX:
  661. case INDEPENDENT_IN://?
  662. case GROUP_IN:{
  663. //System.out.println("makeGreedyPlus():");
  664. //System.out.println(" in="+term.in);
  665. //System.out.println(" out="+term.out);
  666. Term b=new Branch();
  667. b.next=term.in;
  668. term.out.next=b;
  669. b.in=term.in;
  670. b.out=null;
  671. b.out1=null;
  672. b.branchOut=b;
  673. //System.out.println(" returning "+b.in);
  674. return b;
  675. }
  676. default:{
  677. return new Iterator(term,1,-1,iterators);
  678. }
  679. }
  680. }
  681. private final static Term makeLazyPlus(int[] vars,Term term){
  682. //vars[STACK_SIZE]++;
  683. switch(term.type){
  684. case REPEAT_0_INF:
  685. case REPEAT_MIN_INF:
  686. case REPEAT_MIN_MAX:
  687. case REPEAT_REG_MIN_INF:
  688. case REPEAT_REG_MIN_MAX:
  689. case GROUP_IN:{
  690. Term b=new Branch();
  691. term.out.next=b;
  692. b.failNext=term.in;
  693. b.in=term.in;
  694. b.out=b;
  695. b.out1=null;
  696. b.branchOut=null;
  697. return b;
  698. }
  699. case REG:
  700. default:{
  701. Term b=new Branch();
  702. term.next=b;
  703. b.failNext=term;
  704. b.in=term;
  705. b.out=b;
  706. b.out1=null;
  707. b.branchOut=null;
  708. return b;
  709. }
  710. }
  711. }
  712. private final static Term makeGreedyQMark(int[] vars,Term term){
  713. //vars[STACK_SIZE]++;
  714. switch(term.type){
  715. case REPEAT_0_INF:
  716. case REPEAT_MIN_INF:
  717. case REPEAT_MIN_MAX:
  718. case REPEAT_REG_MIN_INF:
  719. case REPEAT_REG_MIN_MAX:
  720. case GROUP_IN:{
  721. Term b=new Branch();
  722. b.next=term.in;
  723. b.in=b;
  724. b.out=term.out;
  725. b.out1=null;
  726. b.branchOut=b;
  727. return b;
  728. }
  729. case REG:
  730. default:{
  731. Term b=new Branch();
  732. b.next=term;
  733. b.in=b;
  734. b.out=term;
  735. b.out1=null;
  736. b.branchOut=b;
  737. return b;
  738. }
  739. }
  740. }
  741. private final static Term makeLazyQMark(int[] vars,Term term){
  742. //vars[STACK_SIZE]++;
  743. switch(term.type){
  744. case REPEAT_0_INF:
  745. case REPEAT_MIN_INF:
  746. case REPEAT_MIN_MAX:
  747. case REPEAT_REG_MIN_INF:
  748. case REPEAT_REG_MIN_MAX:
  749. case GROUP_IN:{
  750. Term b=new Branch();
  751. b.failNext=term.in;
  752. b.in=b;
  753. b.out=b;
  754. b.out1=term.out;
  755. b.branchOut=null;
  756. return b;
  757. }
  758. case REG:
  759. default:{
  760. Term b=new Branch();
  761. b.failNext=term;
  762. b.in=b;
  763. b.out=b;
  764. b.out1=term;
  765. b.branchOut=null;
  766. return b;
  767. }
  768. }
  769. }
  770. private final static Term makeGreedyLimits(int[] vars,Term term,int[] limits,List iterators) throws PatternSyntaxException{
  771. //vars[STACK_SIZE]++;
  772. int m=limits[0];
  773. int n=limits[1];
  774. switch(term.type){
  775. case REPEAT_0_INF:
  776. case REPEAT_MIN_INF:
  777. case REPEAT_MIN_MAX:
  778. case REPEAT_REG_MIN_INF:
  779. case REPEAT_REG_MIN_MAX:
  780. case GROUP_IN:{
  781. int cntreg=vars[CNTREG_COUNT]++;
  782. Term reset=new Term(TermType.CR_SET_0);
  783. reset.cntreg=cntreg;
  784. Term b=new Term(TermType.BRANCH);
  785. Term inc=new Term(TermType.CRSTORE_CRINC);
  786. inc.cntreg=cntreg;
  787. reset.next=b;
  788. if(n>=0){
  789. Term lt=new Term(TermType.CR_LT);
  790. lt.cntreg=cntreg;
  791. lt.maxCount=n;
  792. b.next=lt;
  793. lt.next=term.in;
  794. }
  795. else{
  796. b.next=term.in;
  797. }
  798. term.out.next=inc;
  799. inc.next=b;
  800. if(m>=0){
  801. Term gt=new Term(TermType.CR_GT_EQ);
  802. gt.cntreg=cntreg;
  803. gt.maxCount=m;
  804. b.failNext=gt;
  805. reset.in=reset;
  806. reset.out=gt;
  807. reset.out1=null;
  808. reset.branchOut=null;
  809. }
  810. else{
  811. reset.in=reset;
  812. reset.out=null;
  813. reset.out1=null;
  814. reset.branchOut=b;
  815. }
  816. return reset;
  817. }
  818. default:{
  819. return new Iterator(term,limits[0],limits[1],iterators);
  820. }
  821. }
  822. }
  823. private final static Term makeLazyLimits(int[] vars,Term term,int[] limits){
  824. //vars[STACK_SIZE]++;
  825. int m=limits[0];
  826. int n=limits[1];
  827. switch(term.type){
  828. case REPEAT_0_INF:
  829. case REPEAT_MIN_INF:
  830. case REPEAT_MIN_MAX:
  831. case REPEAT_REG_MIN_INF:
  832. case REPEAT_REG_MIN_MAX:
  833. case GROUP_IN:{
  834. int cntreg=vars[CNTREG_COUNT]++;
  835. Term reset=new Term(TermType.CR_SET_0);
  836. reset.cntreg=cntreg;
  837. Term b=new Term(TermType.BRANCH);
  838. Term inc=new Term(TermType.CRSTORE_CRINC);
  839. inc.cntreg=cntreg;
  840. reset.next=b;
  841. if(n>=0){
  842. Term lt=new Term(TermType.CR_LT);
  843. lt.cntreg=cntreg;
  844. lt.maxCount=n;
  845. b.failNext=lt;
  846. lt.next=term.in;
  847. }
  848. else{
  849. b.failNext=term.in;
  850. }
  851. term.out.next=inc;
  852. inc.next=b;
  853. if(m>=0){
  854. Term gt=new Term(TermType.CR_GT_EQ);
  855. gt.cntreg=cntreg;
  856. gt.maxCount=m;
  857. b.next=gt;
  858. reset.in=reset;
  859. reset.out=gt;
  860. reset.out1=null;
  861. reset.branchOut=null;
  862. return reset;
  863. }
  864. else{
  865. reset.in=reset;
  866. reset.out=b;
  867. reset.out1=null;
  868. reset.branchOut=null;
  869. return reset;
  870. }
  871. }
  872. case REG:
  873. default:{
  874. Term reset=new Term(TermType.CNT_SET_0);
  875. Term b=new Branch(TermType.BRANCH_STORE_CNT);
  876. Term inc=new Term(TermType.CNT_INC);
  877. reset.next=b;
  878. if(n>=0){
  879. Term lt=new Term(TermType.READ_CNT_LT);
  880. lt.maxCount=n;
  881. b.failNext=lt;
  882. lt.next=term;
  883. term.next=inc;
  884. inc.next=b;
  885. }
  886. else{
  887. b.next=term;
  888. term.next=inc;
  889. inc.next=term;
  890. }
  891. if(m>=0){
  892. Term gt=new Term(TermType.CNT_GT_EQ);
  893. gt.maxCount=m;
  894. b.next=gt;
  895. reset.in=reset;
  896. reset.out=gt;
  897. reset.out1=null;
  898. reset.branchOut=null;
  899. return reset;
  900. }
  901. else{
  902. reset.in=reset;
  903. reset.out=b;
  904. reset.out1=null;
  905. reset.branchOut=null;
  906. return reset;
  907. }
  908. }
  909. }
  910. }
  911. private final int parseTerm(char[] data, int i, int out, Term term,
  912. int flags) throws PatternSyntaxException{
  913. char c=data[i++];
  914. boolean inv=false;
  915. switch(c){
  916. case '[':
  917. return CharacterClass.parseClass(data,i,out,term,(flags&IGNORE_CASE)>0,(flags&IGNORE_SPACES)>0,(flags&UNICODE)>0,(flags&XML_SCHEMA)>0);
  918. case '.':
  919. term.type=(flags&DOTALL)>0? TermType.ANY_CHAR: TermType.ANY_CHAR_NE;
  920. break;
  921. case '$':
  922. //term.type=mods[MULTILINE_IND]? LINE_END: END; //??
  923. term.type=(flags&MULTILINE)>0? TermType.LINE_END: TermType.END_EOL;
  924. break;
  925. case '^':
  926. term.type=(flags&MULTILINE)>0? TermType.LINE_START: TermType.START;
  927. break;
  928. case '\\':
  929. if(i>=out) throw new PatternSyntaxException("Escape without a character");
  930. c=data[i++];
  931. esc: switch(c){
  932. case 'f':
  933. c='\f'; // form feed
  934. break;
  935. case 'n':
  936. c='\n'; // new line
  937. break;
  938. case 'r':
  939. c='\r'; // carriage return
  940. break;
  941. case 't':
  942. c='\t'; // tab
  943. break;
  944. case '\\':
  945. c='\\';
  946. break;
  947. case 'u':
  948. if(i+4 >= out) throw new PatternSyntaxException("To few characters for u-escape");
  949. c=(char)((CharacterClass.toHexDigit(data[i++])<<12)+
  950. (CharacterClass.toHexDigit(data[i++])<<8)+
  951. (CharacterClass.toHexDigit(data[i++])<<4)+
  952. CharacterClass.toHexDigit(data[i++]));
  953. break;
  954. case 'v':
  955. if(i+6 >= out) throw new PatternSyntaxException("To few characters for u-escape");
  956. c=(char)((CharacterClass.toHexDigit(data[i++])<<24)+
  957. (CharacterClass.toHexDigit(data[i++])<<16)+
  958. (CharacterClass.toHexDigit(data[i++])<<12)+
  959. (CharacterClass.toHexDigit(data[i++])<<8)+
  960. (CharacterClass.toHexDigit(data[i++])<<4)+
  961. CharacterClass.toHexDigit(data[i++]));
  962. break;
  963. case 'x':{ // hex 2-digit number -> char
  964. if(i >= out) throw new PatternSyntaxException("To few characters for x-escape");
  965. int hex=0;
  966. char d;
  967. if((d=data[i++])=='{'){
  968. while(i<out && (d=data[i++])!='}'){
  969. hex=(hex<<4)+CharacterClass.toHexDigit(d);
  970. if(hex>0xffff) throw new PatternSyntaxException("\\x{<out of range>}");
  971. }
  972. }
  973. else{
  974. if(i >= out) throw new PatternSyntaxException("To few characters for x-escape");
  975. hex=(CharacterClass.toHexDigit(d)<<4)+
  976. CharacterClass.toHexDigit(data[i++]);
  977. }
  978. c=(char)hex;
  979. break;
  980. }
  981. case '0':
  982. case 'o': // oct 2- or 3-digit number -> char
  983. int oct=0;
  984. for(;;){
  985. char d=data[i];
  986. if(d>='0' && d<='7'){
  987. i++;
  988. oct*=8;
  989. oct+=d-'0';
  990. if(oct>0xffff) break;
  991. if(i>=out) break;
  992. }
  993. else break;
  994. }
  995. c=(char)oct;
  996. break;
  997. case 'm': // decimal number -> char
  998. int dec=0;
  999. for(;;){
  1000. char d=data[i++];
  1001. if(d>='0' && d<='9'){
  1002. dec*=10;
  1003. dec+=d-'0';
  1004. if(dec>0xffff) break;
  1005. if(i>=out) break;
  1006. }
  1007. else break;
  1008. }
  1009. i--;
  1010. c=(char)dec;
  1011. break;
  1012. case 'c': // ctrl-char
  1013. c=(char)(data[i++]&0x1f);
  1014. break;
  1015. case 'D': // non-digit
  1016. inv=true;
  1017. // go on
  1018. case 'd': // digit
  1019. CharacterClass.makeDigit(term,inv,(flags&UNICODE)>0);
  1020. return i;
  1021. case 'S': // non-space
  1022. inv=true;
  1023. // go on
  1024. case 's': // space
  1025. CharacterClass.makeSpace(term,inv,(flags&UNICODE)>0);
  1026. return i;
  1027. case 'W': // non-letter
  1028. inv=true;
  1029. // go on
  1030. case 'w': // letter
  1031. CharacterClass.makeWordChar(term,inv,(flags&UNICODE)>0);
  1032. return i;
  1033. case 'B': // non-(word boundary)
  1034. inv=true;
  1035. // go on
  1036. case 'b': // word boundary
  1037. CharacterClass.makeWordBoundary(term,inv,(flags&UNICODE)>0);
  1038. return i;
  1039. case '<': // non-(word boundary)
  1040. CharacterClass.makeWordStart(term,(flags&UNICODE)>0);
  1041. return i;
  1042. case '>': // word boundary
  1043. CharacterClass.makeWordEnd(term,(flags&UNICODE)>0);
  1044. return i;
  1045. case 'A': // text beginning
  1046. term.type=TermType.START;
  1047. return i;
  1048. case 'Z': // text end
  1049. term.type=TermType.END_EOL;
  1050. return i;
  1051. case 'z': // text end
  1052. term.type=TermType.END;
  1053. return i;
  1054. case 'G': // end of last match
  1055. term.type=TermType.LAST_MATCH_END;
  1056. return i;
  1057. case 'P': // \\P{..}
  1058. inv=true;
  1059. case 'p': // \\p{..}
  1060. i=CharacterClass.parseName(data,i,out,term,inv,(flags&IGNORE_SPACES)>0);
  1061. return i;
  1062. default:
  1063. if(c>='1' && c<='9'){
  1064. int n=c-'0';
  1065. while((i<out) && (c=data[i])>='0' && c<='9'){
  1066. n=(n*10)+c-'0';
  1067. i++;
  1068. }
  1069. term.type=(flags&IGNORE_CASE)>0? TermType.REG_I: TermType.REG;
  1070. term.memreg=n;
  1071. return i;
  1072. }
  1073. /*
  1074. if(c<256){
  1075. CustomParser termp=customParsers[c];
  1076. if(termp!=null){
  1077. i=termp.parse(i,data,term);
  1078. return i;
  1079. }
  1080. }
  1081. */
  1082. }
  1083. term.type=TermType.CHAR;
  1084. term.c=c;
  1085. break;
  1086. default:
  1087. if((flags&IGNORE_CASE)==0){
  1088. term.type=TermType.CHAR;
  1089. term.c=c;
  1090. }
  1091. else{
  1092. CharacterClass.makeICase(term,c);
  1093. }
  1094. break;
  1095. }
  1096. return i;
  1097. }
  1098. // one of {n},{n,},{,n},{n1,n2}
  1099. protected static final int parseLimits(int i,int end,char[] data,int[] limits) throws PatternSyntaxException{
  1100. if(limits.length!=LIMITS_LENGTH) throw new IllegalArgumentException("maxTimess.length="+limits.length+", should be 2");
  1101. limits[LIMITS_PARSE_RESULT_INDEX]=LIMITS_OK;
  1102. int ind=0;
  1103. int v=0;
  1104. char c;
  1105. while(i<end){
  1106. c=data[i++];
  1107. switch(c){
  1108. case ' ':
  1109. continue;
  1110. case ',':
  1111. if(ind>0) throw new PatternSyntaxException("illegal construction: {.. , , ..}");
  1112. limits[ind++]=v;
  1113. v=-1;
  1114. continue;
  1115. case '}':
  1116. limits[ind]=v;
  1117. if(ind==0) limits[1]=v;
  1118. return i;
  1119. default:
  1120. if(c>'9' || c<'0'){
  1121. //throw new PatternSyntaxException("illegal symbol in iterator: '{"+c+"}'");
  1122. limits[LIMITS_PARSE_RESULT_INDEX]=LIMITS_FAILURE;
  1123. return i;
  1124. }
  1125. if(v<0) v=0;
  1126. v= v*10 + (c-'0');
  1127. }
  1128. }
  1129. throw new PatternSyntaxException("malformed quantifier");
  1130. }
  1131. public String toString(){
  1132. StringBuffer b=new StringBuffer(100);
  1133. b.append(instanceNum);
  1134. b.append(": ");
  1135. if(inverse) b.append('^');
  1136. switch(type){
  1137. case VOID:
  1138. b.append("[]");
  1139. b.append(" , ");
  1140. break;
  1141. case CHAR:
  1142. b.append(CharacterClass.stringValue(c));
  1143. b.append(" , ");
  1144. break;
  1145. case ANY_CHAR:
  1146. b.append("dotall, ");
  1147. break;
  1148. case ANY_CHAR_NE:
  1149. b.append("dot-eols, ");
  1150. break;
  1151. case BITSET:
  1152. b.append('[');
  1153. b.append(CharacterClass.stringValue0(bitset));
  1154. b.append(']');
  1155. b.append(" , weight=");
  1156. b.append(weight);
  1157. b.append(" , ");
  1158. break;
  1159. case BITSET2:
  1160. b.append('[');
  1161. b.append(CharacterClass.stringValue2(bitset2));
  1162. b.append(']');
  1163. b.append(" , weight=");
  1164. b.append(weight);
  1165. b.append(" , ");
  1166. break;
  1167. case START:
  1168. b.append("abs.start");
  1169. break;
  1170. case END:
  1171. b.append("abs.end");
  1172. break;
  1173. case END_EOL:
  1174. b.append("abs.end-eol");
  1175. break;
  1176. case LINE_START:
  1177. b.append("line start");
  1178. break;
  1179. case LINE_END:
  1180. b.append("line end");
  1181. break;
  1182. case LAST_MATCH_END:
  1183. if(inverse)b.append("non-");
  1184. b.append("BOUNDARY");
  1185. break;
  1186. case BOUNDARY:
  1187. if(inverse)b.append("non-");
  1188. b.append("BOUNDARY");
  1189. break;
  1190. case UBOUNDARY:
  1191. if(inverse)b.append("non-");
  1192. b.append("UBOUNDARY");
  1193. break;
  1194. case DIRECTION:
  1195. b.append("DIRECTION");
  1196. break;
  1197. case UDIRECTION:
  1198. b.append("UDIRECTION");
  1199. break;
  1200. case FIND:
  1201. b.append(">>>{");
  1202. b.append(target);
  1203. b.append("}, <<");
  1204. b.append(distance);
  1205. if(eat){
  1206. b.append(",eat");
  1207. }
  1208. b.append(", ");
  1209. break;
  1210. case REPEAT_0_INF:
  1211. b.append("rpt{");
  1212. b.append(target);
  1213. b.append(",0,inf}");
  1214. if(failNext!=null){
  1215. b.append(", =>");
  1216. b.append(failNext.instanceNum);
  1217. b.append(", ");
  1218. }
  1219. break;
  1220. case REPEAT_MIN_INF:
  1221. b.append("rpt{");
  1222. b.append(target);
  1223. b.append(",");
  1224. b.append(minCount);
  1225. b.append(",inf}");
  1226. if(failNext!=null){
  1227. b.append(", =>");
  1228. b.append(failNext.instanceNum);
  1229. b.append(", ");
  1230. }
  1231. break;
  1232. case REPEAT_MIN_MAX:
  1233. b.append("rpt{");
  1234. b.append(target);
  1235. b.append(",");
  1236. b.append(minCount);
  1237. b.append(",");
  1238. b.append(maxCount);
  1239. b.append("}");
  1240. if(failNext!=null){
  1241. b.append(", =>");
  1242. b.append(failNext.instanceNum);
  1243. b.append(", ");
  1244. }
  1245. break;
  1246. case REPEAT_REG_MIN_INF:
  1247. b.append("rpt{$");
  1248. b.append(memreg);
  1249. b.append(',');
  1250. b.append(minCount);
  1251. b.append(",inf}");
  1252. if(failNext!=null){
  1253. b.append(", =>");
  1254. b.append(failNext.instanceNum);
  1255. b.append(", ");
  1256. }
  1257. break;
  1258. case REPEAT_REG_MIN_MAX:
  1259. b.append("rpt{$");
  1260. b.append(memreg);
  1261. b.append(',');
  1262. b.append(minCount);
  1263. b.append(',');
  1264. b.append(maxCount);
  1265. b.append("}");
  1266. if(failNext!=null){
  1267. b.append(", =>");
  1268. b.append(failNext.instanceNum);
  1269. b.append(", ");
  1270. }
  1271. break;
  1272. case BACKTRACK_0:
  1273. b.append("back(0)");
  1274. break;
  1275. case BACKTRACK_MIN:
  1276. b.append("back(");
  1277. b.append(minCount);
  1278. b.append(")");
  1279. break;
  1280. case BACKTRACK_REG_MIN:
  1281. b.append("back");
  1282. b.append("_$");
  1283. b.append(memreg);
  1284. b.append("(");
  1285. b.append(minCount);
  1286. b.append(")");
  1287. break;
  1288. case GROUP_IN:
  1289. b.append('(');
  1290. if(memreg>0)b.append(memreg);
  1291. b.append('-');
  1292. b.append(" , ");
  1293. break;
  1294. case GROUP_OUT:
  1295. b.append('-');
  1296. if(memreg>0)b.append(memreg);
  1297. b.append(')');
  1298. b.append(" , ");
  1299. break;
  1300. case PLOOKAHEAD_IN:
  1301. b.append('(');
  1302. b.append("=");
  1303. b.append(lookaheadId);
  1304. b.append(" , ");
  1305. break;
  1306. case PLOOKAHEAD_OUT:
  1307. b.append('=');
  1308. b.append(lookaheadId);
  1309. b.append(')');
  1310. b.append(" , ");
  1311. break;
  1312. case NLOOKAHEAD_IN:
  1313. b.append("(!");
  1314. b.append(lookaheadId);
  1315. b.append(" , ");
  1316. if(failNext!=null){
  1317. b.append(", =>");
  1318. b.append(failNext.instanceNum);
  1319. b.append(", ");
  1320. }
  1321. break;
  1322. case NLOOKAHEAD_OUT:
  1323. b.append('!');
  1324. b.append(lookaheadId);
  1325. b.append(')');
  1326. b.append(" , ");
  1327. break;
  1328. case PLOOKBEHIND_IN:
  1329. b.append('(');
  1330. b.append("<=");
  1331. b.append(lookaheadId);
  1332. b.append(" , dist=");
  1333. b.append(distance);
  1334. b.append(" , ");
  1335. break;
  1336. case PLOOKBEHIND_OUT:
  1337. b.append("<=");
  1338. b.append(lookaheadId);
  1339. b.append(')');
  1340. b.append(" , ");
  1341. break;
  1342. case NLOOKBEHIND_IN:
  1343. b.append("(<!");
  1344. b.append(lookaheadId);
  1345. b.append(" , dist=");
  1346. b.append(distance);
  1347. b.append(" , ");
  1348. if(failNext!=null){
  1349. b.append(", =>");
  1350. b.append(failNext.instanceNum);
  1351. b.append(", ");
  1352. }
  1353. break;
  1354. case NLOOKBEHIND_OUT:
  1355. b.append("<!");
  1356. b.append(lookaheadId);
  1357. b.append(')');
  1358. b.append(" , ");
  1359. break;
  1360. case MEMREG_CONDITION:
  1361. b.append("(reg");
  1362. b.append(memreg);
  1363. b.append("?)");
  1364. if(failNext!=null){
  1365. b.append(", =>");
  1366. b.append(failNext.instanceNum);
  1367. b.append(", ");
  1368. }
  1369. break;
  1370. case LOOKAHEAD_CONDITION_IN:
  1371. b.append("(cond");
  1372. b.append(lookaheadId);
  1373. b.append(((Lookahead)this).isPositive? '=': '!');
  1374. b.append(" , ");
  1375. if(failNext!=null){
  1376. b.append(", =>");
  1377. b.append(failNext.instanceNum);
  1378. b.append(", ");
  1379. }
  1380. break;
  1381. case LOOKAHEAD_CONDITION_OUT:
  1382. b.append("cond");
  1383. b.append(lookaheadId);
  1384. b.append(")");
  1385. if(failNext!=null){
  1386. b.append(", =>");
  1387. b.append(failNext.instanceNum);
  1388. b.append(", ");
  1389. }
  1390. break;
  1391. case REG:
  1392. b.append("$");
  1393. b.append(memreg);
  1394. b.append(", ");
  1395. break;
  1396. case SUCCESS:
  1397. b.append("END");
  1398. break;
  1399. case BRANCH_STORE_CNT_AUX1:
  1400. b.append("(aux1)");
  1401. case BRANCH_STORE_CNT:
  1402. b.append("(cnt)");
  1403. case BRANCH:
  1404. b.append("=>");
  1405. if(failNext!=null) b.append(failNext.instanceNum);
  1406. else b.append("null");
  1407. b.append(" , ");
  1408. break;
  1409. default:
  1410. b.append('[');
  1411. switch(type){
  1412. case CNT_SET_0:
  1413. b.append("cnt=0");
  1414. break;
  1415. case CNT_INC:
  1416. b.append("cnt++");
  1417. break;
  1418. case CNT_GT_EQ:
  1419. b.append("cnt>="+maxCount);
  1420. break;
  1421. case READ_CNT_LT:
  1422. b.append("->cnt<"+maxCount);
  1423. break;
  1424. case CRSTORE_CRINC:
  1425. b.append("M("+memreg+")->,Cr("+cntreg+")->,Cr("+cntreg+")++");
  1426. break;
  1427. case CR_SET_0:
  1428. b.append("Cr("+cntreg+")=0");
  1429. break;
  1430. case CR_LT:
  1431. b.append("Cr("+cntreg+")<"+maxCount);
  1432. break;
  1433. case CR_GT_EQ:
  1434. b.append("Cr("+cntreg+")>="+maxCount);
  1435. break;
  1436. default:
  1437. b.append("unknown type: "+type);
  1438. }
  1439. b.append("] , ");
  1440. }
  1441. if(next!=null){
  1442. b.append("->");
  1443. b.append(next.instanceNum);
  1444. b.append(", ");
  1445. }
  1446. //b.append("\r\n");
  1447. return b.toString();
  1448. }
  1449. public String toStringAll(){
  1450. return toStringAll(new Vector());
  1451. }
  1452. public String toStringAll(Vector v){
  1453. v.addElement(new Integer(instanceNum));
  1454. String s=toString();
  1455. if(next!=null){
  1456. if(!v.contains(new Integer(next.instanceNum))){
  1457. s+="\r\n";
  1458. s+=next.toStringAll(v);
  1459. }
  1460. }
  1461. if(failNext!=null){
  1462. if(!v.contains(new Integer(failNext.instanceNum))){
  1463. s+="\r\n";
  1464. s+=failNext.toStringAll(v);
  1465. }
  1466. }
  1467. return s;
  1468. }
  1469. }
  1470. class Pretokenizer{
  1471. private static final int START=1;
  1472. static final int END=2;
  1473. static final int PLAIN_GROUP=3;
  1474. static final int POS_LOOKAHEAD=4;
  1475. static final int NEG_LOOKAHEAD=5;
  1476. static final int POS_LOOKBEHIND=6;
  1477. static final int NEG_LOOKBEHIND=7;
  1478. static final int INDEPENDENT_REGEX=8;
  1479. static final int COMMENT=9;
  1480. static final int CONDITIONAL_GROUP=10;
  1481. static final int FLAGS=11;
  1482. static final int CLASS_GROUP=12;
  1483. static final int NAMED_GROUP=13;
  1484. int tOffset,tOutside,skip;
  1485. int offset,end;
  1486. int c;
  1487. int ttype=START;
  1488. char[] data;
  1489. //results
  1490. private int flags;
  1491. private boolean flagsChanged;
  1492. char[] brackets;
  1493. String groupName;
  1494. boolean groupDeclared;
  1495. Pretokenizer(char[] data,int offset,int end){
  1496. if(offset<0 || end>data.length) throw new IndexOutOfBoundsException("offset="+offset+", end="+end+", length="+data.length);
  1497. this.offset=offset;
  1498. this.end=end;
  1499. this.tOffset=offset;
  1500. this.tOutside=offset;
  1501. this.data=data;
  1502. }
  1503. int flags(int def){
  1504. return flagsChanged? flags: def;
  1505. }
  1506. void next() throws PatternSyntaxException{
  1507. int tOffset=this.tOutside;
  1508. int skip=this.skip;
  1509. tOffset+=skip;
  1510. flagsChanged=false;
  1511. int end=this.end;
  1512. char[] data=this.data;
  1513. boolean esc=false;
  1514. for(int i=tOffset;i<end;i++){
  1515. if(esc){
  1516. esc=false;
  1517. continue;
  1518. }
  1519. char c=data[i];
  1520. switch(c){
  1521. case '\\':
  1522. esc=true;
  1523. continue;
  1524. case '|':
  1525. case ')':
  1526. ttype=c;
  1527. this.tOffset=tOffset;
  1528. this.tOutside=i;
  1529. this.skip=1;
  1530. return;
  1531. case '(':
  1532. if(((i+2)<end) && (data[i+1]=='?')){
  1533. char c1=data[i+2];
  1534. switch(c1){
  1535. case ':':
  1536. ttype=PLAIN_GROUP;
  1537. skip=3; // "(?:" - skip 3 chars
  1538. break;
  1539. case '=':
  1540. ttype=POS_LOOKAHEAD;
  1541. skip=3; // "(?="
  1542. break;
  1543. case '!':
  1544. ttype=NEG_LOOKAHEAD;
  1545. skip=3; // "(?!"
  1546. break;
  1547. case '<':
  1548. switch(c1=data[i+3]){
  1549. case '=':
  1550. ttype=POS_LOOKBEHIND;
  1551. skip=4; // "(?<="
  1552. break;
  1553. case '!':
  1554. ttype=NEG_LOOKBEHIND;
  1555. skip=4; // "(?<!"
  1556. break;
  1557. default:
  1558. throw new PatternSyntaxException("invalid character after '(?<' : "+c1);
  1559. }
  1560. break;
  1561. case '>':
  1562. ttype=INDEPENDENT_REGEX;
  1563. skip=3; // "(?>"
  1564. break;
  1565. case '#':
  1566. ttype=COMMENT;
  1567. skip=3; // ="(?#".length, the makeTree() skips the rest by itself
  1568. break;
  1569. case '(':
  1570. ttype=CONDITIONAL_GROUP;
  1571. skip=2; //"(?"+"(..." - skip "(?" (2 chars) and parse condition as a group
  1572. break;
  1573. case '[':
  1574. ttype=CLASS_GROUP;
  1575. skip=2; // "(?"+"[..]+...-...&...)" - skip 2 chars and parse a class group
  1576. break;
  1577. default:
  1578. int mOff,mLen;
  1579. mLoop:
  1580. for(int p=i+2;p<end;p++){
  1581. char c2=data[p];
  1582. switch(c2){
  1583. case '-':
  1584. case 'i':
  1585. case 'm':
  1586. case 's':
  1587. case 'x':
  1588. case 'u':
  1589. case 'X':
  1590. //System.out.println("case '+-imsxuX' ("+c2+")");
  1591. continue mLoop;
  1592. case ':':
  1593. mOff=i+2;
  1594. mLen=p-mOff;
  1595. if(mLen>0){
  1596. flags=Pattern.parseFlags(data,mOff,mLen);
  1597. flagsChanged=true;
  1598. }
  1599. ttype=PLAIN_GROUP;
  1600. skip=mLen+3; // "(?imsx:" mLen=4; skip= "(?".len + ":".len + mLen = 2+1+4=7
  1601. break mLoop;
  1602. case ')':
  1603. flags=Pattern.parseFlags(data,mOff=(i+2),mLen=(p-mOff));
  1604. flagsChanged=true;
  1605. ttype=FLAGS;
  1606. skip=mLen+3; // "(?imsx)" mLen=4, skip="(?".len+")".len+mLen=2+1+4=7
  1607. break mLoop;
  1608. default:
  1609. throw new PatternSyntaxException("wrong char after \"(?\": "+c2);
  1610. }
  1611. }
  1612. break;
  1613. }
  1614. }
  1615. else if(((i+2)<end) && (data[i+1]=='{')){ //parse named group: ({name}....),({=name}....)
  1616. int p=i+2;
  1617. skip=3; //'({' + '}'
  1618. int nstart,nend;
  1619. boolean isDecl;
  1620. c=data[p];
  1621. //System.out.println("NG: p="+p+", c="+c);
  1622. while(Character.isWhitespace(c)){
  1623. c=data[++p];
  1624. skip++;
  1625. if(p==end)throw new PatternSyntaxException("malformed named group");
  1626. }
  1627. if(c=='='){
  1628. isDecl=false;
  1629. c=data[++p];
  1630. skip++;
  1631. if(p==end)throw new PatternSyntaxException("malformed named group");
  1632. }
  1633. else isDecl=true;
  1634. nstart=p;
  1635. while(Character.isJavaIdentifierPart(c)){
  1636. c=data[++p];
  1637. skip++;
  1638. if(p==end)throw new PatternSyntaxException("malformed named group");
  1639. }
  1640. nend=p;
  1641. while(Character.isWhitespace(c)){
  1642. c=data[++p];
  1643. skip++;
  1644. if(p==end)throw new PatternSyntaxException("malformed named group");
  1645. }
  1646. if(c!='}') throw new PatternSyntaxException("'}' expected at "+(p-i)+" in "+new String(data,i,end-i));
  1647. this.groupName=new String(data,nstart,nend-nstart);
  1648. this.groupDeclared=isDecl;
  1649. ttype=NAMED_GROUP;
  1650. }
  1651. else{
  1652. ttype='(';
  1653. skip=1;
  1654. }
  1655. this.tOffset=tOffset;
  1656. this.tOutside=i;
  1657. this.skip=skip;
  1658. return;
  1659. case '[':
  1660. loop:
  1661. for(;;i++){
  1662. if(i==end) throw new PatternSyntaxException("malformed character class");
  1663. char c1=data[i];
  1664. switch(c1){
  1665. case '\\':
  1666. i++;
  1667. continue;
  1668. case ']':
  1669. break loop;
  1670. }
  1671. }
  1672. }
  1673. }
  1674. ttype=END;
  1675. this.tOffset=tOffset;
  1676. this.tOutside=end;
  1677. }
  1678. }
  1679. class Branch extends Term{
  1680. Branch(){
  1681. type=TermType.BRANCH;
  1682. }
  1683. Branch(TermType type){
  1684. switch(type){
  1685. case BRANCH:
  1686. case BRANCH_STORE_CNT:
  1687. case BRANCH_STORE_CNT_AUX1:
  1688. this.type=type;
  1689. break;
  1690. default:
  1691. throw new IllegalArgumentException("not a branch type: "+type);
  1692. }
  1693. }
  1694. }
  1695. class BackReference extends Term{
  1696. BackReference(int no,boolean icase){
  1697. super(icase? TermType.REG_I: TermType.REG);
  1698. memreg=no;
  1699. }
  1700. }
  1701. class Group extends Term{
  1702. Group(){
  1703. this(0);
  1704. }
  1705. Group(int memreg){
  1706. type=TermType.GROUP_IN;
  1707. this.memreg=memreg;
  1708. //used in append()
  1709. current=null;
  1710. in=this;
  1711. prev=null;
  1712. out=new Term();
  1713. out.type=TermType.GROUP_OUT;
  1714. out.memreg=memreg;
  1715. }
  1716. }
  1717. class ConditionalExpr extends Group{
  1718. protected Term node;
  1719. protected boolean newBranchStarted=false;
  1720. protected boolean linkAsBranch=true;
  1721. ConditionalExpr(Lookahead la){
  1722. super(0);
  1723. //System.out.println("ConditionalExpr("+la+")");
  1724. /*
  1725. * This all is rather tricky.
  1726. * See how this types are handled in Matcher.
  1727. * The shortcoming is that we strongly rely upon
  1728. * the internal structure of Lookahead.
  1729. */
  1730. la.in.type=TermType.LOOKAHEAD_CONDITION_IN;
  1731. la.out.type=TermType.LOOKAHEAD_CONDITION_OUT;
  1732. if(la.isPositive){
  1733. node=la.in;
  1734. linkAsBranch=true;
  1735. //empty 2'nd branch
  1736. node.failNext=out;
  1737. }
  1738. else{
  1739. node=la.out;
  1740. linkAsBranch=false;
  1741. //empty 2'nd branch
  1742. node.next=out;
  1743. }
  1744. //node.prev=in;
  1745. //in.next=node;
  1746. la.prev=in;
  1747. in.next=la;
  1748. current=la;
  1749. //current=node;
  1750. }
  1751. ConditionalExpr(Lookbehind lb){
  1752. super(0);
  1753. //System.out.println("ConditionalExpr("+la+")");
  1754. /*
  1755. * This all is rather tricky.
  1756. * See how this types are handled in Matcher.
  1757. * The shortcoming is that we strongly rely upon
  1758. * the internal structure of Lookahead.
  1759. */
  1760. lb.in.type=TermType.LOOKBEHIND_CONDITION_IN;
  1761. lb.out.type=TermType.LOOKBEHIND_CONDITION_OUT;
  1762. if(lb.isPositive){
  1763. node=lb.in;
  1764. linkAsBranch=true;
  1765. //empty 2'nd branch
  1766. node.failNext=out;
  1767. }
  1768. else{
  1769. node=lb.out;
  1770. linkAsBranch=false;
  1771. //empty 2'nd branch
  1772. node.next=out;
  1773. }
  1774. lb.prev=in;
  1775. in.next=lb;
  1776. current=lb;
  1777. //current=node;
  1778. }
  1779. ConditionalExpr(int memreg){
  1780. super(0);
  1781. //System.out.println("ConditionalExpr("+memreg+")");
  1782. Term condition=new Term(TermType.MEMREG_CONDITION);
  1783. condition.memreg=memreg;
  1784. condition.out=condition;
  1785. condition.out1=null;
  1786. condition.branchOut=null;
  1787. //default branch
  1788. condition.failNext=out;
  1789. node=current=condition;
  1790. linkAsBranch=true;
  1791. condition.prev=in;
  1792. in.next=condition;
  1793. current=condition;
  1794. }
  1795. protected void startNewBranch() throws PatternSyntaxException{
  1796. if(newBranchStarted) throw new PatternSyntaxException("attempt to set a 3'd choice in a conditional expr.");
  1797. Term node=this.node;
  1798. node.out1=null;
  1799. if(linkAsBranch){
  1800. node.out=null;
  1801. node.branchOut=node;
  1802. }
  1803. else{
  1804. node.out=node;
  1805. node.branchOut=null;
  1806. }
  1807. newBranchStarted=true;
  1808. //System.out.println("CondGrp.startNewBranch(): current="+current+", this="+this.toStringAll());
  1809. current=node;
  1810. }
  1811. }
  1812. class IndependentGroup extends Term{
  1813. IndependentGroup(int id){
  1814. super(TermType.CHAR);
  1815. in=this;
  1816. out=new Term();
  1817. type=TermType.INDEPENDENT_IN;
  1818. out.type=TermType.INDEPENDENT_OUT;
  1819. lookaheadId=out.lookaheadId=id;
  1820. }
  1821. }
  1822. class Lookahead extends Term{
  1823. final boolean isPositive;
  1824. Lookahead(int id,boolean isPositive){
  1825. this.isPositive=isPositive;
  1826. in=this;
  1827. out=new Term();
  1828. if(isPositive){
  1829. type=TermType.PLOOKAHEAD_IN;
  1830. out.type=TermType.PLOOKAHEAD_OUT;
  1831. }
  1832. else{
  1833. type=TermType.NLOOKAHEAD_IN;
  1834. out.type=TermType.NLOOKAHEAD_OUT;
  1835. branchOut=this;
  1836. }
  1837. lookaheadId=id;
  1838. out.lookaheadId=id;
  1839. }
  1840. }
  1841. class Lookbehind extends Term{
  1842. final boolean isPositive;
  1843. private int prevDistance=-1;
  1844. Lookbehind(int id,boolean isPositive){
  1845. distance=0;
  1846. this.isPositive=isPositive;
  1847. in=this;
  1848. out=new Term();
  1849. if(isPositive){
  1850. type=TermType.PLOOKBEHIND_IN;
  1851. out.type=TermType.PLOOKBEHIND_OUT;
  1852. }
  1853. else{
  1854. type=TermType.NLOOKBEHIND_IN;
  1855. out.type=TermType.NLOOKBEHIND_OUT;
  1856. branchOut=this;
  1857. }
  1858. lookaheadId=id;
  1859. out.lookaheadId=id;
  1860. }
  1861. protected Term append(Term t) throws PatternSyntaxException{
  1862. distance+=length(t);
  1863. return super.append(t);
  1864. }
  1865. protected Term replaceCurrent(Term t) throws PatternSyntaxException{
  1866. distance+=length(t)-length(current);
  1867. return super.replaceCurrent(t);
  1868. }
  1869. private static int length(Term t) throws PatternSyntaxException{
  1870. TermType type=t.type;
  1871. switch(type){
  1872. case CHAR:
  1873. case BITSET:
  1874. case BITSET2:
  1875. case ANY_CHAR:
  1876. case ANY_CHAR_NE:
  1877. return 1;
  1878. case BOUNDARY: case DIRECTION: case UBOUNDARY: case UDIRECTION:
  1879. case GROUP_IN: case GROUP_OUT: case VOID: case START: case END:
  1880. case END_EOL: case LINE_START: case LINE_END: case LAST_MATCH_END:
  1881. case CNT_SET_0: case CNT_INC: case CNT_GT_EQ: case READ_CNT_LT:
  1882. case CRSTORE_CRINC: case CR_SET_0: case CR_LT: case CR_GT_EQ:
  1883. return 0;
  1884. default:
  1885. throw new PatternSyntaxException("variable length element within a lookbehind assertion");
  1886. }
  1887. }
  1888. protected void startNewBranch() throws PatternSyntaxException{
  1889. prevDistance=distance;
  1890. distance=0;
  1891. super.startNewBranch();
  1892. }
  1893. protected void close() throws PatternSyntaxException{
  1894. int pd=prevDistance;
  1895. if(pd>=0){
  1896. if(distance!=pd) throw new PatternSyntaxException("non-equal branch lengths within a lookbehind assertion");
  1897. }
  1898. super.close();
  1899. }
  1900. }
  1901. class Iterator extends Term{
  1902. Iterator(Term term,int min,int max,List collection) throws PatternSyntaxException{
  1903. collection.add(this);
  1904. switch(term.type){
  1905. case CHAR:
  1906. case ANY_CHAR:
  1907. case ANY_CHAR_NE:
  1908. case BITSET:
  1909. case BITSET2:{
  1910. target=term;
  1911. Term back=new Term();
  1912. if(min<=0 && max<0){
  1913. type=TermType.REPEAT_0_INF;
  1914. back.type=TermType.BACKTRACK_0;
  1915. }
  1916. else if(min>0 && max<0){
  1917. type=TermType.REPEAT_MIN_INF;
  1918. back.type=TermType.BACKTRACK_MIN;
  1919. minCount=back.minCount=min;
  1920. }
  1921. else{
  1922. type=TermType.REPEAT_MIN_MAX;
  1923. back.type=TermType.BACKTRACK_MIN;
  1924. minCount=back.minCount=min;
  1925. maxCount=max;
  1926. }
  1927. failNext=back;
  1928. in=this;
  1929. out=this;
  1930. out1=back;
  1931. branchOut=null;
  1932. return;
  1933. }
  1934. case REG:{
  1935. target=term;
  1936. memreg=term.memreg;
  1937. Term back=new Term();
  1938. if(max<0){
  1939. type=TermType.REPEAT_REG_MIN_INF;
  1940. back.type=TermType.BACKTRACK_REG_MIN;
  1941. minCount=back.minCount=min;
  1942. }
  1943. else{
  1944. type=TermType.REPEAT_REG_MIN_MAX;
  1945. back.type=TermType.BACKTRACK_REG_MIN;
  1946. minCount=back.minCount=min;
  1947. maxCount=max;
  1948. }
  1949. failNext=back;
  1950. in=this;
  1951. out=this;
  1952. out1=back;
  1953. branchOut=null;
  1954. return;
  1955. }
  1956. default:
  1957. throw new PatternSyntaxException("can't iterate this type: "+term.type);
  1958. }
  1959. }
  1960. void optimize(){
  1961. //System.out.println("optimizing myself: "+this);
  1962. //BACKTRACK_MIN_REG_FIND
  1963. Term back=failNext;
  1964. Optimizer opt=Optimizer.find(back.next);
  1965. if(opt==null) return;
  1966. failNext=opt.makeBacktrack(back);
  1967. }
  1968. }