PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/external/icu4c/i18n/regexcst.pl

https://gitlab.com/brian0218/rk3188_r-box_android4.2.2_sdk
Perl | 328 lines | 186 code | 32 blank | 110 comment | 46 complexity | 60200425e9d47a81bdefcffbd6bb47fa MD5 | raw file
  1. #!/usr/bin/perl
  2. # ********************************************************************
  3. # * COPYRIGHT:
  4. # * Copyright (c) 2002-2007, International Business Machines Corporation and
  5. # * others. All Rights Reserved.
  6. # ********************************************************************
  7. #
  8. # regexcst.pl
  9. # Compile the regular expression parser state table data into initialized C data.
  10. # Usage:
  11. # cd icu/source/i18n
  12. # perl regexcst.pl < regexcst.txt > regexcst.h
  13. #
  14. # The output file, regexcst.h, is included by some of the .cpp regex
  15. # implementation files. This perl script is NOT run as part
  16. # of a normal ICU build. It is run by hand when needed, and the
  17. # regexcst.h generated file is put back into cvs.
  18. #
  19. # See regexcst.txt for a description of the input format for this script.
  20. #
  21. # This script is derived from rbbicst.pl, which performs the same function
  22. # for the Rule Based Break Iterator Rule Parser. Perhaps they could be
  23. # merged?
  24. #
  #
  # Pass 1: read the state table description from the input, one transition
  # row per line, and record every row in parallel arrays indexed by state
  # (row) number:
  #
  #     @state_line_num       input line the row came from (for later errors)
  #     @state_literal_chars  the literal character, for rows written as 'c'
  #     @state_char_class     the character class name, for all other rows
  #     @state_flag           "TRUE" if the 'n' flag was given, else "FALSE"
  #     @state_dest_state     name of the target state
  #     @state_push_state     name of the optional ^push-state
  #     @state_func_name      name of the optional action
  #
  # %states maps a state label to the number of its first row, and
  # @stateNames is the reverse mapping.
  #
  # All diagnostics go to STDERR: STDOUT is the generated header file
  # (see the usage line at the top of this script).
  #
  $num_states = 1;        # Always the state number for the line being compiled.
  $line_num   = 0;        # The line number in the input file.
  $states{"pop"} = 255;   # Pre-define "pop": a state labelled "pop" is then
                          #   reported as a duplicate, and references to "pop"
                          #   in the next-state field resolve to 255.

  line_loop: while (my $line = <>) {
      chomp($line);
      $line_num++;
      my @fields = split(' ', $line);

      # Remove # comments: the first field beginning with a '#', plus
      # everything that follows it on the line.
      for my $idx (0 .. $#fields) {
          if ($fields[$idx] =~ /^#/) {
              splice(@fields, $idx);
              last;
          }
      }

      # Ignore blank lines, and those with nothing left after stripping comments.
      next line_loop if (@fields == 0);

      #
      # State label handling.
      #   A first token ending with ":" names a state.  Record the name
      #   together with the current state number so that the number can be
      #   looked up from the name later on.
      #
      if ($fields[0] =~ /:$/) {
          my $state_name = shift @fields;
          $state_name =~ s/://;          # strip off the colon from the state name.
          if (exists $states{$state_name}) {
              # Reported as a warning only; processing continues.
              print STDERR " rbbicst: at line $line_num duplicate definition of state $state_name\n";
          }
          $states{$state_name}     = $num_states;
          $stateNames[$num_states] = $state_name;

          # If the label was the only thing on this line, go on to the next
          # line; otherwise a state definition follows on the same line.
          next line_loop if (@fields == 0);
      }

      #
      # State transition line.  Syntax:
      #     character [n] target-state [^push-state] [function-name]
      # where
      #     [something]  is an optional something
      #     character    is either a single quoted character, e.g. '['
      #                  or the name of a character class, e.g. white_space
      #
      $state_line_num[$num_states] = $line_num;   # remembered for later error messages.

      #
      # First field: character class or literal character for this transition.
      #
      my $char_spec = shift @fields;
      if ($char_spec =~ /^'.'$/) {
          # A quoted literal character.  Take the middle character rather than
          # deleting every quote, so that the literal ''' is kept as well.
          $state_literal_chars[$num_states] = substr($char_spec, 1, 1);
      } else {
          # The name of a character class.
          $state_char_class[$num_states] = $char_spec;
          if ($char_spec =~ /\W/) {
              print STDERR " rbbicsts: at line $line_num, bad character literal or character class name.\n";
              print STDERR " scanning $char_spec\n";
              exit(-1);
          }
      }

      #
      # The optional 'n' flag.
      #
      $state_flag[$num_states] = "FALSE";
      if (@fields > 0 && $fields[0] eq "n") {
          $state_flag[$num_states] = "TRUE";
          shift @fields;
      }

      #
      # The destination state (required).
      #
      if (@fields == 0) {
          print STDERR " rbbicsts: at line $line_num, destination state missing.\n";
          exit(-1);
      }
      $state_dest_state[$num_states] = shift @fields;

      #
      # The push state, if present: a field of the form ^state-name.
      #
      if (@fields > 0 && $fields[0] =~ /^\^/) {
          my $push_state = substr(shift @fields, 1);
          if ($push_state eq "") {
              print STDERR " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
              exit(-1);
          }
          $state_push_state[$num_states] = $push_state;
      }

      #
      # Lastly, the optional action name.
      #
      if (@fields > 0) {
          $state_func_name[$num_states] = shift @fields;
      }

      #
      # There should be no fields left on the line at this point.
      # Anything left over is reported, but does not stop processing.
      #
      if (@fields > 0) {
          print STDERR " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
          print STDERR " scanning $fields[0]\n";
      }

      $num_states++;
  }
  142. #
  143. # We've read in the whole file, now go back and output the
  144. # C source code for the state transition table.
  145. #
  146. # We read all states first, before writing anything, so that the state numbers
  147. # for the destination states are all available to be written.
  148. #
  #
  # Make hashes for the names of the character classes and
  # for the names of the actions that appeared.
  #
  # Rows with no action get the default action "doNOP", so every row has an
  # action name from here on.  The original guard on the %actions assignment
  # consulted @state_action_name, an array that is never filled in; the
  # assignment is idempotent, so it is simply made unconditionally.
  #
  foreach my $row (1 .. $num_states - 1) {
      my $class_name = $state_char_class[$row];
      if (defined($class_name) && $class_name ne "") {
          $charClasses{$class_name} = 1;
      }

      if (!defined($state_func_name[$row]) || $state_func_name[$row] eq "") {
          $state_func_name[$row] = "doNOP";
      }
      $actions{$state_func_name[$row]} = 1;
  }
  #
  # Check that every destination state and every push state named in the
  # table has been defined.  All problems are reported before giving up.
  #
  # Messages go to STDERR, because STDOUT is the generated header file.
  #
  $states{"exit"} = 0;    # Predefined state name, terminates state machine.

  foreach my $row (1 .. $num_states - 1) {
      my $dest = $state_dest_state[$row];
      # "exit" is the only name that is allowed to map to state number 0.
      if (!$states{$dest} && $dest ne "exit") {
          print STDERR "Error at line $state_line_num[$row]: target state \"$dest\" is not defined.\n";
          $errors++;
      }

      my $push = $state_push_state[$row];
      if (defined($push) && $push ne "" && !$states{$push}) {
          print STDERR "Error at line $state_line_num[$row]: push state \"$push\" is not defined.\n";
          $errors++;
      }
  }

  die "regexcst.pl: $errors undefined state reference(s); no table generated.\n" if ($errors > 0);
  #
  # Emit the leading comment banner of the generated header, the include
  # guard, and the opening of the ICU namespace.  Each list element below
  # becomes one output line.
  #
  my $banner_rule = '//---------------------------------------------------------------------------------';
  my @banner_lines = (
      $banner_rule,
      '//',
      '// Generated Header File. Do not edit by hand.',
      '// This file contains the state table for the ICU Regular Expression Pattern Parser',
      '// It is generated by the Perl script "regexcst.pl" from',
      '// the rule parser state definitions file "regexcst.txt".',
      '//',
      '// Copyright (C) 2002-2007 International Business Machines Corporation ',
      '// and others. All rights reserved. ',
      '//',
      $banner_rule,
      '#ifndef RBBIRPT_H',
      '#define RBBIRPT_H',
      '',
      'U_NAMESPACE_BEGIN',
  );
  print map { "$_\n" } @banner_lines;
  #
  # Emit the constants for indices of Unicode Sets.
  # Define one constant for each of the character classes encountered.
  # At the same time, store the index corresponding to the set name back
  # into the %charClasses hash.
  #
  # The names are visited in sorted order so that the numbering, and with it
  # the generated file, is the same on every run; the order returned by a
  # bare "keys" is not stable.
  #
  print "//\n";
  print "// Character classes for regex pattern scanning.\n";
  print "//\n";

  # Sets "default", "quoted" and "eof" get special handling.  They have no
  # corresponding UnicodeSet object in the state machine, but are handled by
  # special case code, so no kRuleSet_ constant is emitted for them.
  my %special_class_index = (
      "default" => 255,
      "quoted"  => 254,
      "eof"     => 253,
  );

  my $next_index = 128;   # State Table values for Unicode char sets range from 128-250.
  foreach my $setName (sort keys %charClasses) {
      if (exists $special_class_index{$setName}) {
          $charClasses{$setName} = $special_class_index{$setName};
          next;
      }
      # Normal character class: emit its index constant and remember the index.
      print " static const uint8_t kRuleSet_$setName = $next_index;\n";
      $charClasses{$setName} = $next_index;
      $next_index++;
  }
  print "\n\n";
  #
  # Emit the enum for the actions to be performed.
  #
  # The action names are emitted in sorted order so that the enumerator
  # order, and with it the generated file, is the same on every run; the
  # order returned by a bare "keys" is not stable.
  #
  print "enum Regex_PatternParseAction {\n";
  foreach my $act (sort keys %actions) {
      print " $act,\n";
  }
  print " rbbiLastAction};\n\n";
  #
  # Emit the struct definition for transition table elements.
  # Each list element below becomes one output line; the final empty
  # element produces the blank line after the closing brace.
  #
  my $struct_rule = '//-------------------------------------------------------------------------------';
  my @struct_lines = (
      $struct_rule,
      '//',
      '// RegexTableEl represents the structure of a row in the transition table',
      '// for the pattern parser state machine.',
      $struct_rule,
      'struct RegexTableEl {',
      ' Regex_PatternParseAction fAction;',
      ' uint8_t fCharClass; // 0-127: an individual ASCII character',
      ' // 128-255: character class index',
      ' uint8_t fNextState; // 0-250: normal next-state numbers',
      ' // 255: pop next-state from stack.',
      ' uint8_t fPushState;',
      ' UBool fNextChar;',
      '};',
      '',
  );
  print map { "$_\n" } @struct_lines;
  #
  # Emit the state transition table, one initializer per table row, in the
  # field order fAction, fCharClass, fNextState, fPushState, fNextChar.
  #
  print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
  print " {doNOP, 0, 0, 0, TRUE}\n";   # State 0 is a dummy. Real states start with index = 1.

  foreach my $row (1 .. $num_states - 1) {
      # Character field: the numeric value of a literal character, or the
      # index assigned to the row's character class.
      my $char_field;
      my $literal = $state_literal_chars[$row];
      if (defined($literal) && $literal ne "") {
          # Use the numeric value, so EBCDIC machines are ok.  The character
          # is passed as a %s argument, not pasted into the format string,
          # so that a '%' literal cannot be mistaken for a conversion.
          $char_field = sprintf(" %d /* %s */,", ord($literal), $literal);
      } else {
          $char_field = " $charClasses{$state_char_class[$row]},";
      }

      # The push-state field is optional.  If omitted, fill the field with a
      # zero, which flags the state machine that there is no push state.
      my $push = $state_push_state[$row];
      my $push_field = (defined($push) && $push ne "") ? " $states{$push}," : "0, ";

      print " , {$state_func_name[$row],",
            $char_field,
            " $states{$state_dest_state[$row]},",
            $push_field,
            " $state_flag[$row]} ";

      # Put out a C++ comment showing the number (index) of this table row,
      # and, if this is the first row of the table for a state, the state name.
      print " // $row ";
      my $label = $stateNames[$row];
      if (defined($label) && $label ne "") {
          print " $label";
      }
      print "\n";
  }
  print " };\n";
  #
  # Emit a mapping array from state numbers to state names, followed by the
  # end of the namespace and of the include guard.
  #
  # The array is used for producing debugging output from the pattern parser.
  # Rows that do not begin a named state get a 0 entry, and one extra 0
  # closes the array.
  #
  my @name_entries = map {
      my $label = $stateNames[$_];
      (defined($label) && $label ne "") ? " \"$label\",\n" : " 0,\n";
  } (0 .. $num_states - 1);

  print "static const char * const RegexStateNames[] = {",
        @name_entries,
        " 0};\n\n",
        "U_NAMESPACE_END\n",
        "#endif\n";