PageRenderTime 35ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/MicroFrameworkPK_v4_2/Framework/Core/System/RegularExpressions/Classes/RegexProgram.cs

https://bitbucket.org/pmfsampaio/netmf-lpc
C# | 238 lines | 107 code | 22 blank | 109 comment | 17 complexity | c03f3c77008274302da6fa62c443ebfe MD5 | raw file
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. * Ported to C# for the .Net Micro Framework by <a href="mailto:juliusfriedman@gmail.com">Julius Friedman</a>
  18. * http://netmf.codeplex.com/
  19. *
  20. * A class that holds compiled regular expressions. This is exposed mainly
  21. * for use by the recompile utility (which helps you produce precompiled
  22. * REProgram objects). You should not otherwise need to work directly with
  23. * this class.
  24. *
  25. * @see RE -> (Regex)
  26. * @see RECompiler -> (RegexCompiler)
  27. *
  28. * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
  29. * @version $Id: REProgram.java 518156 2007-03-14 14:31:26Z vgritsenko $
  30. */
  31. using System;
  32. using System.Text;
  33. namespace System.Text.RegularExpressions
  34. {
  /// <summary>
  /// Holds a compiled regular expression program (an array of opcodes stored as chars)
  /// together with the compile-time optimization data derived from it.
  /// You should not need to work directly with this class.
  /// State is exposed to the rest of the assembly through the internal members below;
  /// the backing fields themselves are private, except for <see cref="pattern"/>.
  /// </summary>
  internal sealed class RegexProgram
  {
      #region Fields

      /// <summary>
      /// The compiled regular expression 'program', which can be thought of as opcodes and read as chars.
      /// </summary>
      private char[] instruction;

      /// <summary>
      /// The amount of the instruction buffer in use out of all allocated instructions.
      /// Recorded by <see cref="setInstructions"/>; it is not read anywhere else in this class.
      /// </summary>
      private int lenInstruction;

      /// <summary>
      /// Prefix string optimization (only populated when the program starts with a literal atom).
      /// </summary>
      private char[] prefix;

      /// <summary>
      /// Execution flags for the program; certain matches can be optimized by the presence or absence of certain flags.
      /// Recomputed whenever <see cref="setInstructions"/> runs (constructors and the Instructions setter).
      /// Can also be changed through the <see cref="Flags"/> property.
      /// </summary>
      private ProgramOptions flags;

      /// <summary>
      /// The maximum number of matches to perform. Defaults to -1; can be changed through <see cref="MaximumMatches"/>.
      /// </summary>
      private int maxMatches = -1;

      /// <summary>
      /// The pattern we are matching against; it cannot be changed after construction.
      /// </summary>
      internal readonly string pattern;

      #endregion

      #region Constructor

      /// <summary>
      /// Constructs a program object from a character array, using the whole array as the program
      /// and an empty pattern string. Internal use only.
      /// </summary>
      /// <param name="instruction">Instruction array with RegexProgram opcode instructions in it (may be null)</param>
      internal RegexProgram(char[] instruction)
          : this(instruction, LengthOf(instruction), string.Empty) { }

      /// <summary>
      /// Constructs a program object from a character array, using the whole array as the program.
      /// </summary>
      /// <param name="parens">Count of parens in the program</param>
      /// <param name="instruction">Character array with RE opcode instructions in it (may be null)</param>
      /// <param name="pattern">The pattern to match against</param>
      public RegexProgram(int parens, char[] instruction, string pattern)
          : this(instruction, LengthOf(instruction), pattern)
      {
          // NOTE(review): the paren count is stored in the same field that backs MaximumMatches.
          this.maxMatches = parens;
      }

      /// <summary>
      /// Constructs a RegexProgram object from a char array which can be thought of as RegexOpcodes.
      /// </summary>
      /// <param name="instruction">Character array with RE opcode instructions in it</param>
      /// <param name="lenInstruction">Amount of instruction array in use</param>
      /// <param name="pattern">The pattern this RegexProgram matches against</param>
      public RegexProgram(char[] instruction, int lenInstruction, string pattern)
      {
          this.pattern = pattern;
          setInstructions(instruction, lenInstruction);
      }

      #endregion

      #region Properties

      /// <summary>
      /// The string representation of the pattern this RegexProgram matches against.
      /// </summary>
      public string Pattern
      {
          get { return pattern; }
      }

      /// <summary>
      /// The char array of instructions which this RegexProgram executes. It can be thought of and read as opcodes.
      /// Assigning a new array re-runs the compile-time optimizations over the whole array,
      /// which resets <see cref="Flags"/> and <see cref="Prefix"/>.
      /// </summary>
      internal char[] Instructions
      {
          get
          {
              return instruction;
          }
          set
          {
              // Use the assigned value, not the current field: the previous code re-applied the old
              // program (and threw when the field was null), silently discarding the new one.
              setInstructions(value, LengthOf(value));
          }
      }

      /// <summary>
      /// Optimization and matching flags.
      /// </summary>
      internal ProgramOptions Flags
      {
          get { return flags; }
          set { flags = value; }
      }

      /// <summary>
      /// The maximum number of matches to match against.
      /// </summary>
      internal int MaximumMatches
      {
          get { return maxMatches; }
          set { maxMatches = value; }
      }

      /// <summary>
      /// The literal prefix of the current regular expression program as a character array,
      /// or null when there is no prefix or no program has been set.
      /// The internal array itself is returned (not a copy), so callers must not modify it.
      /// </summary>
      internal char[] Prefix
      {
          get { return prefix; }
      }

      #endregion

      /// <summary>
      /// Returns the length of <paramref name="buffer"/>, treating a null buffer as empty.
      /// </summary>
      /// <param name="buffer">Instruction buffer, possibly null</param>
      /// <returns>0 for null, otherwise the array length</returns>
      private static int LengthOf(char[] buffer)
      {
          return buffer == null ? 0 : buffer.Length;
      }

      /// <summary>
      /// Sets a new regular expression program to run. It is this method which
      /// performs any special compile-time search optimizations. Currently only
      /// two optimizations are in place - one which checks for backreferences
      /// (so that they can be lazily allocated) and another which attempts to
      /// find a prefix anchor string so that substantial amounts of input can
      /// potentially be skipped without running the actual program.
      /// </summary>
      /// <param name="instruction">Program instruction buffer (null means "no program")</param>
      /// <param name="lenInstruction">Length of instruction buffer in use</param>
      void setInstructions(char[] instruction, int lenInstruction)
      {
          // Save reference to instruction array
          this.instruction = instruction;
          this.lenInstruction = lenInstruction;

          // Reset everything that is derived from the program
          this.flags = 0;
          this.prefix = null;

          // Nothing to optimize when there is no program
          if (instruction == null || lenInstruction == 0)
          {
              return;
          }

          // If the first node is a branch...
          if (lenInstruction >= Regex.nodeSize && instruction[0 + Regex.offsetOpcode] == OpCode.Branch)
          {
              // ...whose 'next' pointer leads to the end node. The pointer is read as a signed
              // 16-bit value, so it is range-checked before being used as an index; the length
              // check also runs before any element beyond the first node is touched.
              int next = (short)instruction[0 + Regex.offsetNext];
              int endOpcodeIndex = next + Regex.offsetOpcode;
              if (lenInstruction >= (Regex.nodeSize * 2)
                  && endOpcodeIndex >= 0
                  && endOpcodeIndex < lenInstruction
                  && instruction[endOpcodeIndex] == OpCode.EndProgram)
              {
                  char nextOp = instruction[Regex.nodeSize + Regex.offsetOpcode];

                  // the branch starts with an atom
                  if (nextOp == OpCode.Atom)
                  {
                      // then get that atom as a prefix because there's no other choice;
                      // the atom's characters follow its node header
                      int lenAtom = instruction[Regex.nodeSize + Regex.offsetOpdata];
                      this.prefix = new char[lenAtom];
                      System.Array.Copy(instruction, Regex.nodeSize * 2, prefix, 0, lenAtom);
                  }
                  // the branch starts with a BOL
                  else if (nextOp == OpCode.BeginOfLine)
                  {
                      // then set the flag indicating that BOL is present
                      this.flags |= ProgramOptions.HasBeginOfLine;
                  }
              }
          }

          // Check for backreferences, walking the program node by node
          for (int i = 0; i < lenInstruction; i += Regex.nodeSize)
          {
              switch (instruction[i + Regex.offsetOpcode])
              {
                  case OpCode.AnyOf:
                      // skip the operand data that follows the node (two chars per entry)
                      i += (instruction[i + Regex.offsetOpdata] * 2);
                      break;

                  case OpCode.Atom:
                      // skip the atom's literal characters
                      i += instruction[i + Regex.offsetOpdata];
                      break;

                  case OpCode.BackRef:
                      // one backreference is enough to set the flag; stop scanning
                      flags |= ProgramOptions.HasBackrefrence;
                      return;
              }
          }
      }
  }
  215. }
  216. }