PageRenderTime 45ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/IronPython_Main/Runtime/Tests/LinqDlrTests/testenv/perl/lib/unicode/mktables.pl

#
Perl | 447 lines | 337 code | 61 blank | 49 comment | 12 complexity | eb07017211fab8a5eb57bbd5f3835106 MD5 | raw file
Possible License(s): GPL-2.0, MPL-2.0-no-copyleft-exception, CPL-1.0, CC-BY-SA-3.0, BSD-3-Clause, ISC, AGPL-3.0, LGPL-2.1, Apache-2.0
  1. #!../../miniperl
  2. use bytes;
  3. $UnicodeData = "Unicode.301";
  4. $SyllableData = "syllables.txt";
  5. $PropData = "PropList.txt";
  6. # Note: we try to keep filenames unique within first 8 chars. Using
  7. # subdirectories for the following helps.
  8. mkdir "In", 0755;
  9. mkdir "Is", 0755;
  10. mkdir "To", 0755;
  11. @todo = (
  12. # typical
  13. # 005F: SPACING UNDERSCROE
  14. ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''],
  15. ['IsAlnum', '$cat =~ /^[LMN]/', ''],
  16. ['IsAlpha', '$cat =~ /^[LM]/', ''],
  17. # 0009: HORIZONTAL TABULATION
  18. # 000A: LINE FEED
  19. # 000B: VERTICAL TABULATION
  20. # 000C: FORM FEED
  21. # 000D: CARRIAGE RETURN
  22. # 0020: SPACE
  23. ['IsSpace', '$cat =~ /^Z/ ||
  24. $code =~ /^(0009|000A|000B|000C|000D)$/', ''],
  25. ['IsSpacePerl',
  26. '$cat =~ /^Z/ ||
  27. $code =~ /^(0009|000A|000C|000D)$/', ''],
  28. ['IsBlank', '$code =~ /^(0020|0009)$/ ||
  29. $cat =~ /^Z[^lp]$/', ''],
  30. ['IsDigit', '$cat =~ /^Nd$/', ''],
  31. ['IsUpper', '$cat =~ /^L[ut]$/', ''],
  32. ['IsLower', '$cat =~ /^Ll$/', ''],
  33. ['IsASCII', '$code le "007f"', ''],
  34. ['IsCntrl', '$cat =~ /^C/', ''],
  35. ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''],
  36. ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''],
  37. ['IsPunct', '$cat =~ /^P/', ''],
  38. # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
  39. ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
  40. ['ToUpper', '$up', '$up'],
  41. ['ToLower', '$down', '$down'],
  42. ['ToTitle', '$title', '$title'],
  43. ['ToDigit', '$dec ne ""', '$dec'],
  44. # Name
  45. ['Name', '$name', '$name'],
  46. # Category
  47. ['Category', '$cat', '$cat'],
  48. # Normative
  49. ['IsM', '$cat =~ /^M/', ''], # Mark
  50. ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
  51. ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
  52. ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
  53. ['IsN', '$cat =~ /^N/', ''], # Number
  54. ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
  55. ['IsNo', '$cat eq "No"', ''], # Number, Other
  56. ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
  57. ['IsZ', '$cat =~ /^Z/', ''], # Separator
  58. ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
  59. ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
  60. ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
  61. ['IsC', '$cat =~ /^C/', ''], # Crazy
  62. ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
  63. ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
  64. ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
  65. ['IsCf', '$cat eq "Cf"', ''], # Other, Format
  66. ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
  67. ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
  68. # Informative
  69. ['IsL', '$cat =~ /^L/', ''], # Letter
  70. ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
  71. ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
  72. ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
  73. ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
  74. ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
  75. ['IsP', '$cat =~ /^P/', ''], # Punctuation
  76. ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
  77. ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
  78. ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
  79. ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
  80. ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
  81. ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
  82. ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
  83. ['IsS', '$cat =~ /^S/', ''], # Symbol
  84. ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
  85. ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
  86. ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
  87. ['IsSo', '$cat eq "So"', ''], # Symbol, Other
  88. # Combining class
  89. ['CombiningClass', '$comb', '$comb'],
  90. # BIDIRECTIONAL PROPERTIES
  91. ['Bidirectional', '$bid', '$bid'],
  92. # Strong types:
  93. ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
  94. # syllabic, and logographic
  95. # characters (e.g., CJK
  96. # ideographs)
  97. ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
  98. # and punctuation specific to
  99. # those scripts
  100. ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
  101. ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
  102. ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
  103. ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
  104. ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
  105. ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
  106. ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
  107. ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
  108. # Weak types:
  109. ['IsBidiEN','$bid eq "EN"', ''], # European Number
  110. ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
  111. ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
  112. ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
  113. ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
  114. # Separators:
  115. ['IsBidiB', '$bid eq "B"', ''], # Block Separator
  116. ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
  117. # Neutrals:
  118. ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
  119. ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
  120. # characters: punctuation,
  121. # symbols
  122. # Decomposition
  123. ['Decomposition', '$decomp', '$decomp'],
  124. ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
  125. ['IsDecoCompat', '$decomp =~ /^</', ''],
  126. ['IsDCfont', '$decomp =~ /^<font>/', ''],
  127. ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
  128. ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
  129. ['IsDCmedial', '$decomp =~ /^<medial>/', ''],
  130. ['IsDCfinal', '$decomp =~ /^<final>/', ''],
  131. ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
  132. ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
  133. ['IsDCsuper', '$decomp =~ /^<super>/', ''],
  134. ['IsDCsub', '$decomp =~ /^<sub>/', ''],
  135. ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
  136. ['IsDCwide', '$decomp =~ /^<wide>/', ''],
  137. ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
  138. ['IsDCsmall', '$decomp =~ /^<small>/', ''],
  139. ['IsDCsquare', '$decomp =~ /^<square>/', ''],
  140. ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
  141. ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
  142. # Number
  143. ['Number', '$num ne ""', '$num'],
  144. # Mirrored
  145. ['IsMirrored', '$mir eq "Y"', ''],
  146. # Arabic
  147. ['ArabLink', '1', '$link'],
  148. ['ArabLnkGrp', '1', '$linkgroup'],
  149. # Jamo
  150. ['JamoShort', '1', '$short'],
  151. # Syllables
  152. syllable_defs(),
  153. # Line break properties - Normative
  154. ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
  155. ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
  156. ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
  157. ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
  158. ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
  159. ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
  160. ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
  161. ['IsLbrkSP','$brk eq "SP"', ''], # Space
  162. ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
  163. # Line break properties - Informative
  164. ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
  165. ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
  166. ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
  167. ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
  168. ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
  169. ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
  170. ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
  171. ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
  172. ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
  173. ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
  174. ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
  175. ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
  176. ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
  177. ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
  178. ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
  179. ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
  180. ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
  181. ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
  182. ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
  183. ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
  184. );
  185. # This is not written for speed...
  186. foreach $file (@todo) {
  187. my ($table, $wanted, $val) = @$file;
  188. next if @ARGV and not grep { $_ eq $table } @ARGV;
  189. print $table,"\n";
  190. if ($table =~ /^(Is|In|To)(.*)/) {
  191. open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
  192. }
  193. else {
  194. open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
  195. }
  196. print OUT <<EOH;
  197. # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
  198. # This file is built by $0 from e.g. $UnicodeData.
  199. # Any changes made here will be lost!
  200. EOH
  201. print OUT <<"END";
  202. return <<'END';
  203. END
  204. print OUT proplist($table, $wanted, $val);
  205. print OUT "END\n";
  206. close OUT;
  207. }
  208. # Must treat blocks specially.
  209. exit if @ARGV and not grep { $_ eq Block } @ARGV;
  210. print "Block\n";
  211. open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
  212. open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
  213. print OUT <<EOH;
  214. # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
  215. # This file is built by $0 from e.g. $UnicodeData.
  216. # Any changes made here will be lost!
  217. EOH
  218. print OUT <<"END";
  219. return <<'END';
  220. END
  221. while (<UD>) {
  222. next if /^#/;
  223. next if /^$/;
  224. chomp;
  225. ($code, $last, $name) = split(/; */);
  226. if ($name) {
  227. print OUT "$code $last $name\n";
  228. $name =~ s/\s+//g;
  229. open(BLOCK, ">In/$name.pl");
  230. print BLOCK <<EOH;
  231. # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
  232. # This file is built by $0 from e.g. $UnicodeData.
  233. # Any changes made here will be lost!
  234. EOH
  235. print BLOCK <<"END2";
  236. return <<'END';
  237. $code $last
  238. END
  239. END2
  240. close BLOCK;
  241. }
  242. }
  243. print OUT "END\n";
  244. close OUT;
  245. ##################################################
  246. sub proplist {
  247. my ($table, $wanted, $val) = @_;
  248. my @wanted;
  249. my $out;
  250. my $split;
  251. return listFromPropFile($wanted) if $val eq $PropData;
  252. if ($table =~ /^Arab/) {
  253. open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
  254. $split = '($code, $name, $link, $linkgroup) = split(/; */);';
  255. }
  256. elsif ($table =~ /^Jamo/) {
  257. open(UD, "Jamo.txt") or warn "Can't open $table: $!";
  258. $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
  259. }
  260. elsif ($table =~ /^IsSyl/) {
  261. open(UD, $SyllableData) or warn "Can't open $table: $!";
  262. $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
  263. }
  264. elsif ($table =~ /^IsLbrk/) {
  265. open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
  266. $split = '($code, $brk, $name) = split(/;/);';
  267. }
  268. else {
  269. open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
  270. $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
  271. $comment, $up, $down, $title) = split(/;/);';
  272. }
  273. if ($table =~ /^(?:To|Is)[A-Z]/) {
  274. eval <<"END";
  275. while (<UD>) {
  276. next if /^#/;
  277. next if /^\\s/;
  278. s/\\s+\$//;
  279. $split
  280. if ($wanted) {
  281. push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
  282. }
  283. }
  284. END
  285. die $@ if $@;
  286. while (@wanted) {
  287. $beg = shift @wanted;
  288. $last = $beg;
  289. while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
  290. (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
  291. $last = shift @wanted;
  292. }
  293. $out .= sprintf "%04x", $beg->[0];
  294. if ($beg->[2]) {
  295. $last = shift @wanted;
  296. }
  297. if ($beg == $last) {
  298. $out .= "\t";
  299. }
  300. else {
  301. $out .= sprintf "\t%04x", $last->[0];
  302. }
  303. $out .= sprintf "\t%04x", $beg->[1] if $val;
  304. $out .= "\n";
  305. }
  306. }
  307. else {
  308. eval <<"END";
  309. while (<UD>) {
  310. next if /^#/;
  311. next if /^\\s*\$/;
  312. chop;
  313. $split
  314. if ($wanted) {
  315. push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
  316. }
  317. }
  318. END
  319. die $@ if $@;
  320. while (@wanted) {
  321. $beg = shift @wanted;
  322. $last = $beg;
  323. while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
  324. ($wanted[0]->[1] eq $last->[1])) {
  325. $last = shift @wanted;
  326. }
  327. $out .= sprintf "%04x", $beg->[0];
  328. if ($beg->[2]) {
  329. $last = shift @wanted;
  330. }
  331. if ($beg == $last) {
  332. $out .= "\t";
  333. }
  334. else {
  335. $out .= sprintf "\t%04x", $last->[0];
  336. }
  337. $out .= sprintf "\t%s\n", $beg->[1];
  338. }
  339. }
  340. $out;
  341. }
  342. sub listFromPropFile {
  343. my ($wanted) = @_;
  344. my $out;
  345. open (UD, $PropData) or die "Can't open $PropData: $!\n";
  346. local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
  347. <UD>;
  348. while (<UD>) {
  349. chomp;
  350. if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
  351. s/\(\d+ chars\)//g;
  352. s/^\s+//mg;
  353. s/\s+$//mg;
  354. s/\.\./\t/g;
  355. $out = lc $_;
  356. last;
  357. }
  358. }
  359. close (UD);
  360. "$out\n";
  361. }
  362. sub syllable_defs {
  363. my @defs;
  364. my %seen;
  365. open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
  366. while (<SD>) {
  367. next if /^\s*(#|$)/;
  368. s/\s+$//;
  369. ($code, $name, $syl) = split /; */;
  370. next unless $syl;
  371. push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
  372. unless $seen{$syl}++;
  373. }
  374. close (SD);
  375. return (@defs);
  376. }
  377. # eof