PageRenderTime 61ms CodeModel.GetById 19ms app.highlight 38ms RepoModel.GetById 1ms app.codeStats 0ms

/IronPython_Main/Runtime/Tests/LinqDlrTests/testenv/perl/lib/unicode/mktables.pl

#
Perl | 447 lines | 337 code | 61 blank | 49 comment | 12 complexity | eb07017211fab8a5eb57bbd5f3835106 MD5 | raw file
  1#!../../miniperl
  2
  3use bytes;
  4
  5$UnicodeData = "Unicode.301";
  6$SyllableData = "syllables.txt";
  7$PropData = "PropList.txt";
  8
  9
 10# Note: we try to keep filenames unique within first 8 chars.  Using
 11# subdirectories for the following helps.
 12mkdir "In", 0755;
 13mkdir "Is", 0755;
 14mkdir "To", 0755;
 15
 16@todo = (
 17# typical
 18
 19    # 005F: SPACING UNDERSCROE
 20    ['IsWord',   '$cat =~ /^[LMN]/ or $code eq "005F"',	''],
 21    ['IsAlnum',  '$cat =~ /^[LMN]/',	''],
 22    ['IsAlpha',  '$cat =~ /^[LM]/',	''],
 23    # 0009: HORIZONTAL TABULATION
 24    # 000A: LINE FEED
 25    # 000B: VERTICAL TABULATION
 26    # 000C: FORM FEED
 27    # 000D: CARRIAGE RETURN
 28    # 0020: SPACE
 29    ['IsSpace',  '$cat  =~ /^Z/ ||
 30                  $code =~ /^(0009|000A|000B|000C|000D)$/',	''],
 31    ['IsSpacePerl',
 32                 '$cat  =~ /^Z/ ||
 33                  $code =~ /^(0009|000A|000C|000D)$/',		''],
 34    ['IsBlank',  '$code =~ /^(0020|0009)$/ ||
 35		  $cat  =~ /^Z[^lp]$/',	''],
 36    ['IsDigit',  '$cat =~ /^Nd$/',	''],
 37    ['IsUpper',  '$cat =~ /^L[ut]$/',	''],
 38    ['IsLower',  '$cat =~ /^Ll$/',	''],
 39    ['IsASCII',  '$code le "007f"',	''],
 40    ['IsCntrl',  '$cat =~ /^C/',	''],
 41    ['IsGraph',  '$cat =~ /^([LMNPS]|Co)/',	''],
 42    ['IsPrint',  '$cat =~ /^([LMNPS]|Co|Zs)/',	''],
 43    ['IsPunct',  '$cat =~ /^P/',	''],
 44    # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
 45    ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/',	''],
 46    ['ToUpper',  '$up',			'$up'],
 47    ['ToLower',  '$down',		'$down'],
 48    ['ToTitle',  '$title',		'$title'],
 49    ['ToDigit',  '$dec ne ""',		'$dec'],
 50
 51# Name
 52
 53    ['Name',	'$name',		'$name'],
 54
 55# Category
 56
 57    ['Category', '$cat',		'$cat'],
 58
 59# Normative
 60
 61    ['IsM',	'$cat =~ /^M/',		''],	# Mark
 62    ['IsMn',	'$cat eq "Mn"',		''],	# Mark, Non-Spacing 
 63    ['IsMc',	'$cat eq "Mc"',		''],	# Mark, Combining
 64    ['IsMe',	'$cat eq "Me"',		''],    # Mark, Enclosing
 65
 66    ['IsN',	'$cat =~ /^N/',		''],	# Number
 67    ['IsNd',	'$cat eq "Nd"',		''],	# Number, Decimal Digit
 68    ['IsNo',	'$cat eq "No"',		''],	# Number, Other
 69    ['IsNl',	'$cat eq "Nl"',		''],    # Number, Letter
 70
 71    ['IsZ',	'$cat =~ /^Z/',		''],	# Separator
 72    ['IsZs',	'$cat eq "Zs"',		''],	# Separator, Space
 73    ['IsZl',	'$cat eq "Zl"',		''],	# Separator, Line
 74    ['IsZp',	'$cat eq "Zp"',		''],	# Separator, Paragraph
 75
 76    ['IsC',	'$cat =~ /^C/',		''],	# Crazy
 77    ['IsCc',	'$cat eq "Cc"',		''],	# Other, Control or Format
 78    ['IsCo',	'$cat eq "Co"',		''],	# Other, Private Use
 79    ['IsCn',	'$cat eq "Cn"',		''],	# Other, Not Assigned
 80    ['IsCf',	'$cat eq "Cf"',		''],    # Other, Format
 81    ['IsCs',	'$cat eq "Cs"',		''],    # Other, Surrogate
 82    ['IsCn',	'Unassigned Code Value',$PropData],	# Other, Not Assigned
 83 
 84# Informative
 85
 86    ['IsL',	'$cat =~ /^L/',		''],	# Letter
 87    ['IsLu',	'$cat eq "Lu"',		''],	# Letter, Uppercase
 88    ['IsLl',	'$cat eq "Ll"',		''],	# Letter, Lowercase
 89    ['IsLt',	'$cat eq "Lt"',		''],	# Letter, Titlecase 
 90    ['IsLm',	'$cat eq "Lm"',		''],	# Letter, Modifier
 91    ['IsLo',	'$cat eq "Lo"',		''],	# Letter, Other 
 92
 93    ['IsP',	'$cat =~ /^P/',		''],	# Punctuation
 94    ['IsPd',	'$cat eq "Pd"',		''],	# Punctuation, Dash
 95    ['IsPs',	'$cat eq "Ps"',		''],	# Punctuation, Open
 96    ['IsPe',	'$cat eq "Pe"',		''],	# Punctuation, Close
 97    ['IsPo',	'$cat eq "Po"',		''],	# Punctuation, Other
 98    ['IsPc',	'$cat eq "Pc"',		''],	# Punctuation, Connector
 99    ['IsPi',	'$cat eq "Pi"',		''],	# Punctuation, Initial quote
100    ['IsPf',	'$cat eq "Pf"',		''],	# Punctuation, Final quote
101
102    ['IsS',	'$cat =~ /^S/',		''],	# Symbol
103    ['IsSm',	'$cat eq "Sm"',		''],	# Symbol, Math
104    ['IsSk',	'$cat eq "Sk"',		''],	# Symbol, Modifier
105    ['IsSc',	'$cat eq "Sc"',		''],	# Symbol, Currency
106    ['IsSo',	'$cat eq "So"',		''],	# Symbol, Other
107
108# Combining class
109    ['CombiningClass', '$comb',		'$comb'],
110
111# BIDIRECTIONAL PROPERTIES
112 
113    ['Bidirectional', '$bid',		'$bid'],
114
115# Strong types:
116
117    ['IsBidiL',	'$bid eq "L"',		''],	# Left-Right; Most alphabetic,
118						# syllabic, and logographic
119						# characters (e.g., CJK
120						# ideographs)
121    ['IsBidiR',	'$bid eq "R"',		''],	# Right-Left; Arabic, Hebrew,
122						# and punctuation specific to
123						# those scripts
124
125    ['IsBidiLRE', '$bid eq "LRE"',       ''],    # Left-to-Right Embedding
126    ['IsBidiLRO', '$bid eq "LRO"',       ''],    # Left-to-Right Override
127    ['IsBidiAL', '$bid eq "AL"',         ''],    # Right-to-Left Arabic
128    ['IsBidiRLE', '$bid eq "RLE"',       ''],    # Right-to-Left Embedding
129    ['IsBidiRLO', '$bid eq "RLO"',       ''],    # Right-to-Left Override
130    ['IsBidiPDF', '$bid eq "PDF"',       ''],    # Pop Directional Format
131    ['IsBidiNSM', '$bid eq "NSM"',       ''],    # Non-Spacing Mark
132    ['IsBidiBN', '$bid eq "BN"',         ''],    # Boundary Neutral
133
134# Weak types:
135
136    ['IsBidiEN','$bid eq "EN"',		''],	# European Number
137    ['IsBidiES','$bid eq "ES"',		''],	# European Number Separator
138    ['IsBidiET','$bid eq "ET"',		''],	# European Number Terminator
139    ['IsBidiAN','$bid eq "AN"',		''],	# Arabic Number
140    ['IsBidiCS','$bid eq "CS"',		''],	# Common Number Separator
141
142# Separators:
143
144    ['IsBidiB',	'$bid eq "B"',		''],	# Block Separator
145    ['IsBidiS',	'$bid eq "S"',		''],	# Segment Separator
146
147# Neutrals:
148
149    ['IsBidiWS','$bid eq "WS"',		''],	# Whitespace
150    ['IsBidiON','$bid eq "ON"',		''],	# Other Neutrals ; All other
151						# characters: punctuation,
152						# symbols
153
154# Decomposition
155
156    ['Decomposition',	'$decomp',	'$decomp'],
157    ['IsDecoCanon',	'$decomp && $decomp !~ /^</',	''],
158    ['IsDecoCompat',	'$decomp =~ /^</',		''],
159    ['IsDCfont',	'$decomp =~ /^<font>/',		''],
160    ['IsDCnoBreak',	'$decomp =~ /^<noBreak>/',	''],
161    ['IsDCinitial',	'$decomp =~ /^<initial>/',	''],
162    ['IsDCmedial',	'$decomp =~ /^<medial>/',	''],
163    ['IsDCfinal',	'$decomp =~ /^<final>/',	''],
164    ['IsDCisolated',	'$decomp =~ /^<isolated>/',	''],
165    ['IsDCcircle',	'$decomp =~ /^<circle>/',	''],
166    ['IsDCsuper',	'$decomp =~ /^<super>/',	''],
167    ['IsDCsub',		'$decomp =~ /^<sub>/',		''],
168    ['IsDCvertical',	'$decomp =~ /^<vertical>/',	''],
169    ['IsDCwide',	'$decomp =~ /^<wide>/',		''],
170    ['IsDCnarrow',	'$decomp =~ /^<narrow>/',	''],
171    ['IsDCsmall',	'$decomp =~ /^<small>/',	''],
172    ['IsDCsquare',	'$decomp =~ /^<square>/',	''],
173    ['IsDCfraction',	'$decomp =~ /^<fraction>/',	''],
174    ['IsDCcompat',	'$decomp =~ /^<compat>/',	''],
175
176# Number
177
178    ['Number', 	'$num ne ""',		'$num'],
179
180# Mirrored
181
182    ['IsMirrored', '$mir eq "Y"',	''],
183
184# Arabic
185
186    ['ArabLink', 	'1',		'$link'],
187    ['ArabLnkGrp', 	'1',		'$linkgroup'],
188
189# Jamo
190
191    ['JamoShort',	'1',		'$short'],
192
193# Syllables
194
195    syllable_defs(),
196
197# Line break properties - Normative
198
199    ['IsLbrkBK','$brk eq "BK"',		''],	# Mandatory Break
200    ['IsLbrkCR','$brk eq "CR"',		''],	# Carriage Return
201    ['IsLbrkLF','$brk eq "LF"',		''],	# Line Feed
202    ['IsLbrkCM','$brk eq "CM"',		''],	# Attached Characters and Combining Marks
203    ['IsLbrkSG','$brk eq "SG"',		''],	# Surrogates
204    ['IsLbrkGL','$brk eq "GL"',		''],	# Non-breaking (Glue)
205    ['IsLbrkCB','$brk eq "CB"',		''],	# Contingent Break Opportunity
206    ['IsLbrkSP','$brk eq "SP"',		''],	# Space
207    ['IsLbrkZW','$brk eq "ZW"',		''],	# Zero Width Space
208
209# Line break properties - Informative
210    ['IsLbrkXX','$brk eq "XX"',		''],	# Unknown
211    ['IsLbrkOP','$brk eq "OP"',		''],	# Opening Punctuation
212    ['IsLbrkCL','$brk eq "CL"',		''],	# Closing Punctuation
213    ['IsLbrkQU','$brk eq "QU"',		''],	# Ambiguous Quotation
214    ['IsLbrkNS','$brk eq "NS"',		''],	# Non Starter
215    ['IsLbrkEX','$brk eq "EX"',		''],	# Exclamation/Interrogation
216    ['IsLbrkSY','$brk eq "SY"',		''],	# Symbols Allowing Breaks
217    ['IsLbrkIS','$brk eq "IS"',		''],	# Infix Separator (Numeric)
218    ['IsLbrkPR','$brk eq "PR"',		''],	# Prefix (Numeric)
219    ['IsLbrkPO','$brk eq "PO"',		''],	# Postfix (Numeric)
220    ['IsLbrkNU','$brk eq "NU"',		''],	# Numeric
221    ['IsLbrkAL','$brk eq "AL"',		''],	# Ordinary Alphabetic and Symbol Characters
222    ['IsLbrkID','$brk eq "ID"',		''],	# Ideographic
223    ['IsLbrkIN','$brk eq "IN"',		''],	# Inseparable
224    ['IsLbrkHY','$brk eq "HY"',		''],	# Hyphen
225    ['IsLbrkBB','$brk eq "BB"',		''],	# Break Opportunity Before
226    ['IsLbrkBA','$brk eq "BA"',		''],	# Break Opportunity After
227    ['IsLbrkSA','$brk eq "SA"',		''],	# Complex Context (South East Asian)
228    ['IsLbrkAI','$brk eq "AI"',		''],	# Ambiguous (Alphabetic or Ideographic)
229    ['IsLbrkB2','$brk eq "B2"',		''],	# Break Opportunity Before and After
230);
231
232# This is not written for speed...
233
234foreach $file (@todo) {
235    my ($table, $wanted, $val) = @$file;
236    next if @ARGV and not grep { $_ eq $table } @ARGV;
237    print $table,"\n";
238    if ($table =~ /^(Is|In|To)(.*)/) {
239	open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
240    }
241    else {
242	open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
243    }
244    print OUT <<EOH;
245# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
246# This file is built by $0 from e.g. $UnicodeData.
247# Any changes made here will be lost!
248EOH
249    print OUT <<"END";
250return <<'END';
251END
252    print OUT proplist($table, $wanted, $val);
253    print OUT "END\n";
254    close OUT;
255}
256
257# Must treat blocks specially.
258
259exit if @ARGV and not grep { $_ eq Block } @ARGV;
260print "Block\n";
261open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
262open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
263print OUT <<EOH;
264# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
265# This file is built by $0 from e.g. $UnicodeData.
266# Any changes made here will be lost!
267EOH
268print OUT <<"END";
269return <<'END';
270END
271
272while (<UD>) {
273    next if /^#/;
274    next if /^$/;
275    chomp;
276    ($code, $last, $name) = split(/; */);
277    if ($name) {
278	print OUT "$code	$last	$name\n";
279	$name =~ s/\s+//g;
280	open(BLOCK, ">In/$name.pl");
281	print BLOCK <<EOH;
282# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
283# This file is built by $0 from e.g. $UnicodeData.
284# Any changes made here will be lost!
285EOH
286	print BLOCK <<"END2";
287return <<'END';
288$code	$last
289END
290END2
291	close BLOCK;
292    }
293}
294
295print OUT "END\n";
296close OUT;
297
298##################################################
299
300sub proplist {
301    my ($table, $wanted, $val) = @_;
302    my @wanted;
303    my $out;
304    my $split;
305
306    return listFromPropFile($wanted) if $val eq $PropData;
307
308    if ($table =~ /^Arab/) {
309	open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
310
311	$split = '($code, $name, $link, $linkgroup) = split(/; */);';
312    }
313    elsif ($table =~ /^Jamo/) {
314	open(UD, "Jamo.txt") or warn "Can't open $table: $!";
315
316	$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
317    }
318    elsif ($table =~ /^IsSyl/) {
319	open(UD, $SyllableData) or warn "Can't open $table: $!";
320
321	$split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
322    }
323    elsif ($table =~ /^IsLbrk/) {
324	open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
325
326	$split = '($code, $brk, $name) = split(/;/);';
327    }
328    else {
329	open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
330
331	$split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
332		$comment, $up, $down, $title) = split(/;/);';
333    }
334
335    if ($table =~ /^(?:To|Is)[A-Z]/) {
336	eval <<"END";
337	    while (<UD>) {
338		next if /^#/;
339		next if /^\\s/;
340		s/\\s+\$//;
341		$split
342		if ($wanted) {
343		    push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
344		}
345	    }
346END
347	die $@ if $@;
348
349	while (@wanted) {
350	    $beg = shift @wanted;
351	    $last = $beg;
352	    while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
353		(not $val or $wanted[0]->[1] == $last->[1] + 1)) {
354		    $last = shift @wanted;
355	    }
356	    $out .= sprintf "%04x", $beg->[0];
357	    if ($beg->[2]) {
358		$last = shift @wanted;
359	    }
360	    if ($beg == $last) {
361		$out .= "\t";
362	    }
363	    else {
364		$out .= sprintf "\t%04x", $last->[0];
365	    }
366	    $out .= sprintf "\t%04x", $beg->[1] if $val;
367	    $out .= "\n";
368	}
369    }
370    else {
371	eval <<"END";
372	    while (<UD>) {
373		next if /^#/;
374		next if /^\\s*\$/;
375		chop;
376		$split
377		if ($wanted) {
378		    push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
379		}
380	    }
381END
382	die $@ if $@;
383
384	while (@wanted) {
385	    $beg = shift @wanted;
386	    $last = $beg;
387	    while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
388		($wanted[0]->[1] eq $last->[1])) {
389		    $last = shift @wanted;
390	    }
391	    $out .= sprintf "%04x", $beg->[0];
392	    if ($beg->[2]) {
393		$last = shift @wanted;
394	    }
395	    if ($beg == $last) {
396		$out .= "\t";
397	    }
398	    else {
399		$out .= sprintf "\t%04x", $last->[0];
400	    }
401	    $out .= sprintf "\t%s\n", $beg->[1];
402	}
403    }
404    $out;
405}
406
407sub listFromPropFile {
408    my ($wanted) = @_;
409    my $out;
410
411    open (UD, $PropData) or die "Can't open $PropData: $!\n";
412    local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:";   # not 42?
413
414    <UD>;
415    while (<UD>) {
416        chomp;
417        if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
418            s/\(\d+ chars\)//g;
419            s/^\s+//mg;
420            s/\s+$//mg;
421            s/\.\./\t/g;
422	    $out = lc $_;
423	    last;
424        }
425    }
426    close (UD);
427    "$out\n";
428}
429
430sub syllable_defs {
431    my @defs;
432    my %seen;
433
434    open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
435    while (<SD>) {
436        next if /^\s*(#|$)/;
437        s/\s+$//;
438        ($code, $name, $syl) = split /; */;
439        next unless $syl;
440        push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
441                                                     unless $seen{$syl}++;
442    }
443    close (SD);
444    return (@defs);
445}
446
447# eof