perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/perl -w
	2	use strict;
	3	use Carp;
	4
	5	die "$0: Please run me as ./mktables to avoid unnecessary differences\n"
	6	unless $0 eq "./mktables";
	7
	8	##
	9	## mktables -- create the runtime Perl Unicode files (lib/unicore/*/.pl)
	10	## from the Unicode database files (lib/unicore/*.txt).
	11	##
	12
	13	mkdir("lib", 0755);
	14	mkdir("To", 0755);
	15
	16	##
	17	## Process any args.
	18	##
	19	my $Verbose = 0;
	20	my $MakeTestScript = 0;
	21
	22	while (@ARGV)
	23	{
	24	my $arg = shift @ARGV;
	25	if ($arg eq '-v') {
	26	$Verbose = 1;
	27	} elsif ($arg eq '-q') {
	28	$Verbose = 0;
	29	} elsif ($arg eq '-maketest') {
	30	$MakeTestScript = 1;
	31	} else {
	32	die "usage: $0 [-v\|-q] [-maketest]";
	33	}
	34	}
	35
	36	my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
	37
	38	my $HEADER=<<"EOF";
	39	# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
	40	# This file is built by $0 from e.g. UnicodeData.txt.
	41	# Any changes made here will be lost!
	42
	43	EOF
	44
	45
	46	##
	47	## Given a filename and a reference to an array of lines,
	48	## write the lines to the file only if the contents have not changed.
	49	##
	50	sub WriteIfChanged($\@)
	51	{
	52	my $file = shift;
	53	my $lines = shift;
	54
	55	my $TextToWrite = join '', @$lines;
	56	if (open IN, $file) {
	57	local($/) = undef;
	58	my $PreviousText = <IN>;
	59	close IN;
	60	if ($PreviousText eq $TextToWrite) {
	61	print "$file unchanged.\n" if $Verbose;
	62	return;
	63	}
	64	}
	65	if (not open OUT, ">$file") {
	66	die "$0: can't open $file for output: $!\n";
	67	}
	68	print "$file written.\n" if $Verbose;
	69
	70	print OUT $TextToWrite;
	71	close OUT;
	72	}
	73
	74	##
	75	## The main datastructure (a "Table") represents a set of code points that
	76	## are part of a particular quality (that are part of \pL, \p{InGreek},
	77	## etc.). They are kept as ranges of code points (starting and ending of
	78	## each range).
	79	##
	80	## For example, a range ASCII LETTERS would be represented as:
	81	## [ [ 0x41 => 0x5A, 'UPPER' ],
	82	## [ 0x61 => 0x7A, 'LOWER, ] ]
	83	##
	84	sub RANGE_START() { 0 } ## index into range element
	85	sub RANGE_END() { 1 } ## index into range element
	86	sub RANGE_NAME() { 2 } ## index into range element
	87
	88	## Conceptually, these should really be folded into the 'Table' objects
	89	my %TableInfo;
	90	my %TableDesc;
	91	my %FuzzyNames;
	92	my %AliasInfo;
	93	my %CanonicalToOrig;
	94
	95	##
	96	## Turn something like
	97	## OLD-ITALIC
	98	## into
	99	## OldItalic
	100	##
	101	sub CanonicalName($)
	102	{
	103	my $orig = shift;
	104	my $name = lc $orig;
	105	$name =~ s/(?<![a-z])(\w)/\u$1/g;
	106	$name =~ s/[-_\s]+//g;
	107
	108	$CanonicalToOrig{$name} = $orig if not $CanonicalToOrig{$name};
	109	return $name;
	110	}
	111
	112	##
	113	## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
	114	##
	115	## Called like:
	116	## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1);
	117	##
	118	## Normally, these parameters are set when the Table is created (when the
	119	## Table->New constructor is called), but there are times when it needs to
	120	## be done after-the-fact...)
	121	##
	122	sub New_Prop($$$@)
	123	{
	124	my $Type = shift; ## "Is" or "In";
	125	my $Name = shift;
	126	my $Table = shift;
	127
	128	## remaining args are optional key/val
	129	my %Args = @_;
	130
	131	my $Fuzzy = delete $Args{Fuzzy};
	132	my $Desc = delete $Args{Desc}; # description
	133
	134	$Name = CanonicalName($Name) if $Fuzzy;
	135
	136	## sanity check a few args
	137	if (%Args or ($Type ne 'Is' and $Type ne 'In') or not ref $Table) {
	138	confess "$0: bad args to New_Prop"
	139	}
	140
	141	if (not $TableInfo{$Type}->{$Name})
	142	{
	143	$TableInfo{$Type}->{$Name} = $Table;
	144	$TableDesc{$Type}->{$Name} = $Desc;
	145	if ($Fuzzy) {
	146	$FuzzyNames{$Type}->{$Name} = $Name;
	147	}
	148	}
	149	}
	150
	151
	152	##
	153	## Creates a new Table object.
	154	##
	155	## Args are key/value pairs:
	156	## In => Name -- Name of "In" property to be associated with
	157	## Is => Name -- Name of "Is" property to be associated with
	158	## Fuzzy => Boolean -- True if name can be accessed "fuzzily"
	159	## Desc => String -- Description of the property
	160	##
	161	## No args are required.
	162	##
	163	sub Table::New
	164	{
	165	my $class = shift;
	166	my %Args = @_;
	167
	168	my $Table = bless [], $class;
	169
	170	my $Fuzzy = delete $Args{Fuzzy};
	171	my $Desc = delete $Args{Desc};
	172
	173	for my $Type ('Is', 'In')
	174	{
	175	if (my $Name = delete $Args{$Type}) {
	176	New_Prop($Type => $Name, $Table, Desc => $Desc, Fuzzy => $Fuzzy);
	177	}
	178	}
	179
	180	## shouldn't have any left over
	181	if (%Args) {
	182	confess "$0: bad args to Table->New"
	183	}
	184
	185	return $Table;
	186	}
	187
	188	##
	189	## Returns true if the Table has no code points
	190	##
	191	sub Table::IsEmpty
	192	{
	193	my $Table = shift; #self
	194	return not @$Table;
	195	}
	196
	197	##
	198	## Returns true if the Table has code points
	199	##
	200	sub Table::NotEmpty
	201	{
	202	my $Table = shift; #self
	203	return @$Table;
	204	}
	205
	206	##
	207	## Returns the maximum code point currently in the table.
	208	##
	209	sub Table::Max
	210	{
	211	my $Table = shift; #self
	212	confess "oops" if $Table->IsEmpty; ## must have code points to have a max
	213	return $Table->[-1]->[RANGE_END];
	214	}
	215
	216	##
	217	## Replaces the codepoints in the Table with those in the Table given
	218	## as an arg. (NOTE: this is not a "deep copy").
	219	##
	220	sub Table::Replace($$)
	221	{
	222	my $Table = shift; #self
	223	my $New = shift;
	224
	225	@$Table = @$New;
	226	}
	227
	228	##
	229	## Given a new code point, make the last range of the Table extend to
	230	## include the new (and all intervening) code points.
	231	##
	232	sub Table::Extend
	233	{
	234	my $Table = shift; #self
	235	my $codepoint = shift;
	236
	237	my $PrevMax = $Table->Max;
	238
	239	confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax;
	240
	241	$Table->[-1]->[RANGE_END] = $codepoint;
	242	}
	243
	244	##
	245	## Given a code point range start and end (and optional name), blindly
	246	## append them to the list of ranges for the Table.
	247	##
	248	## NOTE: Code points must be added in strictly ascending numeric order.
	249	##
	250	sub Table::RawAppendRange
	251	{
	252	my $Table = shift; #self
	253	my $start = shift;
	254	my $end = shift;
	255	my $name = shift;
	256	$name = "" if not defined $name; ## warning: $name can be "0"
	257
	258	push @$Table, [ $start, # RANGE_START
	259	$end, # RANGE_END
	260	$name ]; # RANGE_NAME
	261	}
	262
	263	##
	264	## Given a code point (and optional name), add it to the Table.
	265	##
	266	## NOTE: Code points must be added in strictly ascending numeric order.
	267	##
	268	sub Table::Append
	269	{
	270	my $Table = shift; #self
	271	my $codepoint = shift;
	272	my $name = shift;
	273	$name = "" if not defined $name; ## warning: $name can be "0"
	274
	275	##
	276	## If we've already got a range working, and this code point is the next
	277	## one in line, and if the name is the same, just extend the current range.
	278	##
	279	if ($Table->NotEmpty
	280	and
	281	$Table->Max == $codepoint - 1
	282	and
	283	$Table->[-1]->[RANGE_NAME] eq $name)
	284	{
	285	$Table->Extend($codepoint);
	286	}
	287	else
	288	{
	289	$Table->RawAppendRange($codepoint, $codepoint, $name);
	290	}
	291	}
	292
	293	##
	294	## Given a code point range starting value and ending value (and name),
	295	## Add the range to teh Table.
	296	##
	297	## NOTE: Code points must be added in strictly ascending numeric order.
	298	##
	299	sub Table::AppendRange
	300	{
	301	my $Table = shift; #self
	302	my $start = shift;
	303	my $end = shift;
	304	my $name = shift;
	305	$name = "" if not defined $name; ## warning: $name can be "0"
	306
	307	$Table->Append($start, $name);
	308	$Table->Extend($end) if $end > $start;
	309	}
	310
	311	##
	312	## Return a new Table that represents all code points not in the Table.
	313	##
	314	sub Table::Invert
	315	{
	316	my $Table = shift; #self
	317
	318	my $New = Table->New();
	319	my $max = -1;
	320	for my $range (@$Table)
	321	{
	322	my $start = $range->[RANGE_START];
	323	my $end = $range->[RANGE_END];
	324	if ($start-1 >= $max+1) {
	325	$New->AppendRange($max+1, $start-1, "");
	326	}
	327	$max = $end;
	328	}
	329	if ($max+1 < $LastUnicodeCodepoint) {
	330	$New->AppendRange($max+1, $LastUnicodeCodepoint);
	331	}
	332	return $New;
	333	}
	334
	335	##
	336	## Merges any number of other tables with $self, returning the new table.
	337	## (existing tables are not modified)
	338	##
	339	##
	340	## Args may be Tables, or individual code points (as integers).
	341	##
	342	## Can be called as either a constructor or a method.
	343	##
	344	sub Table::Merge
	345	{
	346	shift(@_) if not ref $_[0]; ## if called as a constructor, lose the class
	347	my @Tables = @_;
	348
	349	## Accumulate all records from all tables
	350	my @Records;
	351	for my $Arg (@Tables)
	352	{
	353	if (ref $Arg) {
	354	## arg is a table -- get its ranges
	355	push @Records, @$Arg;
	356	} else {
	357	## arg is a codepoint, make a range
	358	push @Records, [ $Arg, $Arg ]
	359	}
	360	}
	361
	362	## sort by range start, with longer ranges coming first.
	363	my ($first, @Rest) = sort {
	364	($a->[RANGE_START] <=> $b->[RANGE_START])
	365	or
	366	($b->[RANGE_END] <=> $b->[RANGE_END])
	367	} @Records;
	368
	369	my $New = Table->New();
	370
	371	## Ensuring the first range is there makes the subsequent loop easier
	372	$New->AppendRange($first->[RANGE_START],
	373	$first->[RANGE_END]);
	374
	375	## Fold in records so long as they add new information.
	376	for my $set (@Rest)
	377	{
	378	my $start = $set->[RANGE_START];
	379	my $end = $set->[RANGE_END];
	380	if ($start > $New->Max) {
	381	$New->AppendRange($start, $end);
	382	} elsif ($end > $New->Max) {
	383	$New->Extend($end);
	384	}
	385	}
	386
	387	return $New;
	388	}
	389
	390	##
	391	## Given a filename, write a representation of the Table to a file.
	392	## May have an optional comment as a 2nd arg.
	393	##
	394	sub Table::Write
	395	{
	396	my $Table = shift; #self
	397	my $filename = shift;
	398	my $comment = shift;
	399
	400	my @OUT = $HEADER;
	401	if (defined $comment) {
	402	$comment =~ s/\s+\Z//;
	403	$comment =~ s/^/# /gm;
	404	push @OUT, "#\n$comment\n#\n";
	405	}
	406	push @OUT, "return <<'END';\n";
	407
	408	for my $set (@$Table)
	409	{
	410	my $start = $set->[RANGE_START];
	411	my $end = $set->[RANGE_END];
	412	my $name = $set->[RANGE_NAME];
	413
	414	if ($start == $end) {
	415	push @OUT, sprintf "%04X\t\t%s\n", $start, $name;
	416	} else {
	417	push @OUT, sprintf "%04X\t%04X\t%s\n", $start, $end, $name;
	418	}
	419	}
	420
	421	push @OUT, "END\n";
	422
	423	WriteIfChanged($filename, @OUT);
	424	}
	425
	426	## This used only for making the test script.
	427	## helper function
	428	sub IsUsable($)
	429	{
	430	my $code = shift;
	431	return 0 if $code <= 0x0000; ## don't use null
	432	return 0 if $code >= $LastUnicodeCodepoint; ## keep in range
	433	return 0 if ($code >= 0xD800 and $code <= 0xDFFF); ## no surrogates
	434	return 0 if ($code >= 0xFDD0 and $code <= 0xFDEF); ## utf8.c says no good
	435	return 0 if (($code & 0xFFFF) == 0xFFFE); ## utf8.c says no good
	436	return 0 if (($code & 0xFFFF) == 0xFFFF); ## utf8.c says no good
	437	return 1;
	438	}
	439
	440	## Return a code point that's part of the table.
	441	## Returns nothing if the table is empty (or covers only surrogates).
	442	## This used only for making the test script.
	443	sub Table::ValidCode
	444	{
	445	my $Table = shift; #self
	446	for my $set (@$Table) {
	447	return $set->[RANGE_END] if IsUsable($set->[RANGE_END]);
	448	}
	449	return ();
	450	}
	451
	452	## Return a code point that's not part of the table
	453	## Returns nothing if the table covers all code points.
	454	## This used only for making the test script.
	455	sub Table::InvalidCode
	456	{
	457	my $Table = shift; #self
	458
	459	return 0x1234 if $Table->IsEmpty();
	460
	461	for my $set (@$Table)
	462	{
	463	if (IsUsable($set->[RANGE_END] + 1))
	464	{
	465	return $set->[RANGE_END] + 1;
	466	}
	467
	468	if (IsUsable($set->[RANGE_START] - 1))
	469	{
	470	return $set->[RANGE_START] - 1;
	471	}
	472	}
	473	return ();
	474	}
	475
	476	###########################################################################
	477	###########################################################################
	478	###########################################################################
	479
	480
	481	##
	482	## Called like:
	483	## New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 1);
	484	##
	485	## The args must be in that order, although the Fuzzy pair may be omitted.
	486	##
	487	## This creates 'IsAll' as an alias for 'IsAny'
	488	##
	489	sub New_Alias($$$@)
	490	{
	491	my $Type = shift; ## "Is" or "In"
	492	my $Alias = shift;
	493	my $SameAs = shift; # expecting "SameAs" -- just ignored
	494	my $Name = shift;
	495
	496	## remaining args are optional key/val
	497	my %Args = @_;
	498
	499	my $Fuzzy = delete $Args{Fuzzy};
	500
	501	## sanity check a few args
	502	if (%Args or ($Type ne 'Is' and $Type ne 'In') or $SameAs ne 'SameAs') {
	503	confess "$0: bad args to New_Alias"
	504	}
	505
	506	$Alias = CanonicalName($Alias) if $Fuzzy;
	507
	508	if (not $TableInfo{$Type}->{$Name})
	509	{
	510	my $CName = CanonicalName($Name);
	511	if ($TableInfo{$Type}->{$CName}) {
	512	confess "$0: Use canonical form '$CName' instead of '$Name' for alias.";
	513	} else {
	514	confess "$0: don't have orignial $Type => $Name to make alias";
	515	}
	516	}
	517	if ($TableInfo{$Alias}) {
	518	confess "$0: already have original $Type => $Alias; can't make alias";
	519	}
	520	$AliasInfo{$Type}->{$Name} = $Alias;
	521	if ($Fuzzy) {
	522	$FuzzyNames{$Type}->{$Alias} = $Name;
	523	}
	524
	525	}
	526
	527
	528	## All assigned code points
	529	my $Assigned = Table->New(Is => 'Assigned',
	530	Desc => "All assigned code points",
	531	Fuzzy => 0);
	532
	533	my $Name = Table->New(); ## all characters, individually by name
	534	my $General = Table->New(); ## all characters, grouped by category
	535	my %General;
	536	my %Cat;
	537
	538	##
	539	## Process UnicodeData.txt (Categories, etc.)
	540	##
	541	sub UnicodeData_Txt()
	542	{
	543	my $Bidi = Table->New();
	544	my $Deco = Table->New();
	545	my $Comb = Table->New();
	546	my $Number = Table->New();
	547	my $Mirrored = Table->New(Is => 'Mirrored',
	548	Desc => "Mirrored in bidirectional text",
	549	Fuzzy => 0);
	550
	551	my %DC;
	552	my %Bidi;
	553	my %Deco;
	554	$Deco{Canon} = Table->New(Is => 'Canon',
	555	Desc => 'Decomposes to multiple characters',
	556	Fuzzy => 0);
	557	$Deco{Compat} = Table->New(Is => 'Compat',
	558	Desc => 'Compatible with a more-basic character',
	559	Fuzzy => 0);
	560
	561	## Initialize Perl-generated categories
	562	## (Categories from UnicodeData.txt are auto-initialized in gencat)
	563	$Cat{Alnum} =
	564	Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0);
	565	$Cat{Alpha} =
	566	Table->New(Is => 'Alpha', Desc => "[[:Alpha:]]", Fuzzy => 0);
	567	$Cat{ASCII} =
	568	Table->New(Is => 'ASCII', Desc => "[[:ASCII:]]", Fuzzy => 0);
	569	$Cat{Blank} =
	570	Table->New(Is => 'Blank', Desc => "[[:Blank:]]", Fuzzy => 0);
	571	$Cat{Cntrl} =
	572	Table->New(Is => 'Cntrl', Desc => "[[:Cntrl:]]", Fuzzy => 0);
	573	$Cat{Digit} =
	574	Table->New(Is => 'Digit', Desc => "[[:Digit:]]", Fuzzy => 0);
	575	$Cat{Graph} =
	576	Table->New(Is => 'Graph', Desc => "[[:Graph:]]", Fuzzy => 0);
	577	$Cat{Lower} =
	578	Table->New(Is => 'Lower', Desc => "[[:Lower:]]", Fuzzy => 0);
	579	$Cat{Print} =
	580	Table->New(Is => 'Print', Desc => "[[:Print:]]", Fuzzy => 0);
	581	$Cat{Punct} =
	582	Table->New(Is => 'Punct', Desc => "[[:Punct:]]", Fuzzy => 0);
	583	$Cat{Space} =
	584	Table->New(Is => 'Space', Desc => "[[:Space:]]", Fuzzy => 0);
	585	$Cat{Title} =
	586	Table->New(Is => 'Title', Desc => "[[:Title:]]", Fuzzy => 0);
	587	$Cat{Upper} =
	588	Table->New(Is => 'Upper', Desc => "[[:Upper:]]", Fuzzy => 0);
	589	$Cat{XDigit} =
	590	Table->New(Is => 'XDigit', Desc => "[[:XDigit:]]", Fuzzy => 0);
	591	$Cat{Word} =
	592	Table->New(Is => 'Word', Desc => "[[:Word:]]", Fuzzy => 0);
	593	$Cat{SpacePerl} =
	594	Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0);
	595
	596	my %To;
	597	$To{Upper} = Table->New();
	598	$To{Lower} = Table->New();
	599	$To{Title} = Table->New();
	600	$To{Digit} = Table->New();
	601
	602	sub gencat($$$$)
	603	{
	604	my ($name, ## Name ("LATIN CAPITAL LETTER A")
	605	$cat, ## Category ("Lu", "Zp", "Nd", etc.)
	606	$code, ## Code point (as an integer)
	607	$op) = @_;
	608
	609	my $MajorCat = substr($cat, 0, 1); ## L, M, Z, S, etc
	610
	611	$Assigned->$op($code);
	612	$Name->$op($code, $name);
	613	$General->$op($code, $cat);
	614
	615	## add to the sub category (e.g. "Lu", "Nd", "Cf", ..)
	616	$Cat{$cat} \|\|= Table->New(Is => $cat,
	617	Desc => "General Category '$cat'",
	618	Fuzzy => 0);
	619	$Cat{$cat}->$op($code);
	620
	621	## add to the major category (e.g. "L", "N", "C", ...)
	622	$Cat{$MajorCat} \|\|= Table->New(Is => $MajorCat,
	623	Desc => "Major Category '$MajorCat'",
	624	Fuzzy => 0);
	625	$Cat{$MajorCat}->$op($code);
	626
	627	($General{$name} \|\|= Table->New)->$op($code, $name);
	628
	629	# 005F: SPACING UNDERSCORE
	630	$Cat{Word}->$op($code) if $cat =~ /^[LMN]\|Pc/;
	631	$Cat{Alnum}->$op($code) if $cat =~ /^[LM]\|Nd/;
	632	$Cat{Alpha}->$op($code) if $cat =~ /^[LM]/;
	633
	634	my $isspace =
	635	($cat =~ /Zs\|Zl\|Zp/ &&
	636	$code != 0x200B) # 200B is ZWSP which is for line break control
	637	# and therefore it is not part of "space" even while it is "Zs".
	638	\|\| $code == 0x0009 # 0009: HORIZONTAL TAB
	639	\|\| $code == 0x000A # 000A: LINE FEED
	640	\|\| $code == 0x000B # 000B: VERTICAL TAB
	641	\|\| $code == 0x000C # 000C: FORM FEED
	642	\|\| $code == 0x000D # 000D: CARRIAGE RETURN
	643	\|\| $code == 0x0085 # 0085: NEL
	644
	645	;
	646
	647	$Cat{Space}->$op($code) if $isspace;
	648
	649	$Cat{SpacePerl}->$op($code) if $isspace
	650	&& $code != 0x000B; # Backward compat.
	651
	652	$Cat{Blank}->$op($code) if $isspace
	653	&& !($code == 0x000A \|\|
	654	$code == 0x000B \|\|
	655	$code == 0x000C \|\|
	656	$code == 0x000D \|\|
	657	$code == 0x0085 \|\|
	658	$cat =~ /^Z[lp]/);
	659
	660	$Cat{Digit}->$op($code) if $cat eq "Nd";
	661	$Cat{Upper}->$op($code) if $cat eq "Lu";
	662	$Cat{Lower}->$op($code) if $cat eq "Ll";
	663	$Cat{Title}->$op($code) if $cat eq "Lt";
	664	$Cat{ASCII}->$op($code) if $code <= 0x007F;
	665	$Cat{Cntrl}->$op($code) if $cat =~ /^C/;
	666	my $isgraph = !$isspace && $cat !~ /Cc\|Cs\|Cn/;
	667	$Cat{Graph}->$op($code) if $isgraph;
	668	$Cat{Print}->$op($code) if $isgraph \|\| $isspace;
	669	$Cat{Punct}->$op($code) if $cat =~ /^P/;
	670
	671	$Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9
	672	\|\| ($code >= 0x41 && $code <= 0x46) ## A..F
	673	\|\| ($code >= 0x61 && $code <= 0x66); ## a..f
	674	}
	675
	676	## open ane read file.....
	677	if (not open IN, "UnicodeData.txt") {
	678	die "$0: UnicodeData.txt: $!\n";
	679	}
	680
	681	##
	682	## For building \p{_CombAbove} and \p{_CanonDCIJ}
	683	##
	684	my %_Above_HexCodes; ## Hexcodes for chars with $comb == 230 ("ABOVE")
	685
	686	my %CodeToDeco; ## Maps code to decomp. list for chars with first
	687	## decomp. char an "i" or "j" (for \p{_CanonDCIJ})
	688
	689	## This is filled in as we go....
	690	my $CombAbove = Table->New(Is => '_CombAbove',
	691	Desc => '(for internal casefolding use)',
	692	Fuzzy => 0);
	693
	694	while (<IN>)
	695	{
	696	next unless /^[0-9A-Fa-f]+;/;
	697	s/\s+$//;
	698
	699	my ($hexcode, ## code point in hex (e.g. "0041")
	700	$name, ## character name (e.g. "LATIN CAPITAL LETTER A")
	701	$cat, ## category (e.g. "Lu")
	702	$comb, ## Canonical combining class (e.t. "230")
	703	$bidi, ## directional category (e.g. "L")
	704	$deco, ## decomposition mapping
	705	$decimal, ## decimal digit value
	706	$digit, ## digit value
	707	$number, ## numeric value
	708	$mirrored, ## mirrored
	709	$unicode10, ## name in Unicode 1.0
	710	$comment, ## comment field
	711	$upper, ## uppercase mapping
	712	$lower, ## lowercase mapping
	713	$title, ## titlecase mapping
	714	) = split(/\s;\s/);
	715
	716	# Note that in Unicode 3.2 there will be names like
	717	# LINE FEED (LF), which probably means that \N{} needs
	718	# to cope also with LINE FEED and LF.
	719	$name = $unicode10 if $name eq '<control>' && $unicode10 ne '';
	720
	721	my $code = hex($hexcode);
	722
	723	if ($comb and $comb == 230) {
	724	$CombAbove->Append($code);
	725	$_Above_HexCodes{$hexcode} = 1;
	726	}
	727
	728	## Used in building \p{_CanonDCIJ}
	729	if ($deco and $deco =~ m/^006[9A]\b/) {
	730	$CodeToDeco{$code} = $deco;
	731	}
	732
	733	##
	734	## There are a few pairs of lines like:
	735	## AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
	736	## D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
	737	## that define ranges.
	738	##
	739	if ($name =~ /^<(.+), (First\|Last)>$/)
	740	{
	741	$name = $1;
	742	gencat($name, $cat, $code, $2 eq 'First' ? 'Append' : 'Extend');
	743	#New_Prop(In => $name, $General{$name}, Fuzzy => 1);
	744	}
	745	else
	746	{
	747	## normal (single-character) lines
	748	gencat($name, $cat, $code, 'Append');
	749
	750	# No Append() here since since several codes may map into one.
	751	$To{Upper}->RawAppendRange($code, $code, $upper) if $upper;
	752	$To{Lower}->RawAppendRange($code, $code, $lower) if $lower;
	753	$To{Title}->RawAppendRange($code, $code, $title) if $title;
	754	$To{Digit}->Append($code, $decimal) if length $decimal;
	755
	756	$Bidi->Append($code, $bidi);
	757	$Comb->Append($code, $comb) if $comb;
	758	$Number->Append($code, $number) if length $number;
	759
	760	$Mirrored->Append($code) if $mirrored eq "Y";
	761
	762	$Bidi{$bidi} \|\|= Table->New(Is => "Bidi$bidi",
	763	Desc => "Bi-directional category '$bidi'",
	764	Fuzzy => 0);
	765	$Bidi{$bidi}->Append($code);
	766
	767	if ($deco)
	768	{
	769	$Deco->Append($code, $deco);
	770	if ($deco =~/^<(\w+)>/)
	771	{
	772	$Deco{Compat}->Append($code);
	773
	774	$DC{$1} \|\|= Table->New(Is => "DC$1",
	775	Desc => "Compatible with '$1'",
	776	Fuzzy => 0);
	777	$DC{$1}->Append($code);
	778	}
	779	else
	780	{
	781	$Deco{Canon}->Append($code);
	782	}
	783	}
	784	}
	785	}
	786	close IN;
	787
	788	##
	789	## Tidy up a few special cases....
	790	##
	791
	792	$Cat{Cn} = $Assigned->Invert; ## Cn is everything that doesn't exist
	793	New_Prop(Is => 'Cn',
	794	$Cat{Cn},
	795	Desc => "General Category 'Cn' [not functional in Perl]",
	796	Fuzzy => 0);
	797
	798	## Unassigned is the same as 'Cn'
	799	New_Alias(Is => 'Unassigned', SameAs => 'Cn', Fuzzy => 0);
	800
	801	$Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C
	802
	803
	804	# L& is Ll, Lu, and Lt.
	805	New_Prop(Is => 'L&',
	806	Table->Merge(@Cat{qw[Ll Lu Lt]}),
	807	Desc => '[\p{Ll}\p{Lu}\p{Lt}]',
	808	Fuzzy => 0);
	809
	810	## Any and All are all code points.
	811	my $Any = Table->New(Is => 'Any',
	812	Desc => sprintf("[\\x{0000}-\\x{%X}]",
	813	$LastUnicodeCodepoint),
	814	Fuzzy => 0);
	815	$Any->RawAppendRange(0, $LastUnicodeCodepoint);
	816
	817	New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 0);
	818
	819	##
	820	## Build special properties for Perl's internal case-folding needs:
	821	## \p{_CaseIgnorable}
	822	## \p{_CanonDCIJ}
	823	## \p{_CombAbove}
	824	## _CombAbove was built above. Others are built here....
	825	##
	826
	827	## \p{_CaseIgnorable} is [\p{Mn}\0x00AD\x2010]
	828	New_Prop(Is => '_CaseIgnorable',
	829	Table->Merge($Cat{Mn},
	830	0x00AD, #SOFT HYPHEN
	831	0x2010), #HYPHEN
	832	Desc => '(for internal casefolding use)',
	833	Fuzzy => 0);
	834
	835
	836	## \p{_CanonDCIJ} is fairly complex...
	837	my $CanonCDIJ = Table->New(Is => '_CanonDCIJ',
	838	Desc => '(for internal casefolding use)',
	839	Fuzzy => 0);
	840	## It contains the ASCII 'i' and 'j'....
	841	$CanonCDIJ->Append(0x0069); # ASCII ord("i")
	842	$CanonCDIJ->Append(0x006A); # ASCII ord("j")
	843	## ...and any character with a decomposition that starts with either of
	844	## those code points, but only if the decomposition does not have any
	845	## combining character with the "ABOVE" canonical combining class.
	846	for my $code (sort { $a <=> $b} keys %CodeToDeco)
	847	{
	848	## Need to ensure that all decomposition characters do not have
	849	## a %HexCodeToComb in %AboveCombClasses.
	850	my $want = 1;
	851	for my $deco_hexcode (split / /, $CodeToDeco{$code})
	852	{
	853	if (exists $_Above_HexCodes{$deco_hexcode}) {
	854	## one of the decmposition chars has an ABOVE combination
	855	## class, so we're not interested in this one
	856	$want = 0;
	857	last;
	858	}
	859	}
	860	if ($want) {
	861	$CanonCDIJ->Append($code);
	862	}
	863	}
	864
	865
	866
	867	##
	868	## Now dump the files.
	869	##
	870	$Name->Write("Name.pl");
	871	$Bidi->Write("Bidirectional.pl");
	872	$Comb->Write("CombiningClass.pl");
	873	$Deco->Write("Decomposition.pl");
	874	$Number->Write("Number.pl");
	875	$General->Write("Category.pl");
	876
	877	for my $to (sort keys %To) {
	878	$To{$to}->Write("To/$to.pl");
	879	}
	880	}
	881
	882	##
	883	## Process LineBreak.txt
	884	##
	885	sub LineBreak_Txt()
	886	{
	887	if (not open IN, "LineBreak.txt") {
	888	die "$0: LineBreak.txt: $!\n";
	889	}
	890
	891	my $Lbrk = Table->New();
	892	my %Lbrk;
	893
	894	while (<IN>)
	895	{
	896	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s;\s(\w+)/;
	897
	898	my ($first, $last, $lbrk) = (hex($1), hex($2\|\|""), $3);
	899
	900	$Lbrk->Append($first, $lbrk);
	901
	902	$Lbrk{$lbrk} \|\|= Table->New(Is => "Lbrk$lbrk",
	903	Desc => "Linebreak category '$lbrk'",
	904	Fuzzy => 0);
	905	$Lbrk{$lbrk}->Append($first);
	906
	907	if ($last) {
	908	$Lbrk->Extend($last);
	909	$Lbrk{$lbrk}->Extend($last);
	910	}
	911	}
	912	close IN;
	913
	914	$Lbrk->Write("Lbrk.pl");
	915	}
	916
	917	##
	918	## Process ArabicShaping.txt.
	919	##
	920	sub ArabicShaping_txt()
	921	{
	922	if (not open IN, "ArabicShaping.txt") {
	923	die "$0: ArabicShaping.txt: $!\n";
	924	}
	925
	926	my $ArabLink = Table->New();
	927	my $ArabLinkGroup = Table->New();
	928
	929	while (<IN>)
	930	{
	931	next unless /^[0-9A-Fa-f]+;/;
	932	s/\s+$//;
	933
	934	my ($hexcode, $name, $link, $linkgroup) = split(/\s;\s/);
	935	my $code = hex($hexcode);
	936	$ArabLink->Append($code, $link);
	937	$ArabLinkGroup->Append($code, $linkgroup);
	938	}
	939	close IN;
	940
	941	$ArabLink->Write("ArabLink.pl");
	942	$ArabLinkGroup->Write("ArabLnkGrp.pl");
	943	}
	944
	945	##
	946	## Process Jamo.txt.
	947	##
	948	sub Jamo_txt()
	949	{
	950	if (not open IN, "Jamo.txt") {
	951	die "$0: Jamo.txt: $!\n";
	952	}
	953	my $Short = Table->New();
	954
	955	while (<IN>)
	956	{
	957	next unless /^([0-9A-Fa-f]+)\s;\s(\w*)/;
	958	my ($code, $short) = (hex($1), $2);
	959
	960	$Short->Append($code, $short);
	961	}
	962	close IN;
	963	$Short->Write("JamoShort.pl");
	964	}
	965
	966	##
	967	## Process Scripts.txt.
	968	##
	969	sub Scripts_txt()
	970	{
	971	my @ScriptInfo;
	972
	973	if (not open(IN, "Scripts.txt")) {
	974	die "$0: Scripts.txt: $!\n";
	975	}
	976	while (<IN>) {
	977	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s;\s(.+?)\s*\#/;
	978
	979	# Wait until all the scripts have been read since
	980	# they are not listed in numeric order.
	981	push @ScriptInfo, [ hex($1), hex($2\|\|""), $3 ];
	982	}
	983	close IN;
	984
	985	# Now append the scripts properties in their code point order.
	986
	987	my %Script;
	988	my $Scripts = Table->New();
	989
	990	for my $script (sort { $a->[0] <=> $b->[0] } @ScriptInfo)
	991	{
	992	my ($first, $last, $name) = @$script;
	993	$Scripts->Append($first, $name);
	994
	995	$Script{$name} \|\|= Table->New(Is => $name,
	996	Desc => "Script '$name'",
	997	Fuzzy => 1);
	998	$Script{$name}->Append($first, $name);
	999
	1000	if ($last) {
	1001	$Scripts->Extend($last);
	1002	$Script{$name}->Extend($last);
	1003	}
	1004	}
	1005
	1006	$Scripts->Write("Scripts.pl");
	1007
	1008	## Common is everything not explicitly assigned to a Script
	1009	##
	1010	## *shouldn't this be intersected with \p{Assigned}? ****
	1011	##
	1012	New_Prop(Is => 'Common',
	1013	$Scripts->Invert,
	1014	Desc => 'Pseudo-Script of codepoints not in other Unicode scripts',
	1015	Fuzzy => 1);
	1016	}
	1017
	1018	##
	1019	## Given a name like "Close Punctuation", return a regex (that when applied
	1020	## with /i) matches any valid form of that name (e.g. "ClosePunctuation",
	1021	## "Close-Punctuation", etc.)
	1022	##
	1023	## Accept any space, dash, or underbar where in the official name there is
	1024	## space or a dash (or underbar, but there never is).
	1025	##
	1026	##
	1027	sub NameToRegex($)
	1028	{
	1029	my $Name = shift;
	1030	$Name =~ s/[- _]/(?:[-_]\|\\s+)?/g;
	1031	return $Name;
	1032	}
	1033
	1034	##
	1035	## Process Blocks.txt.
	1036	##
	1037	sub Blocks_txt()
	1038	{
	1039	my $Blocks = Table->New();
	1040	my %Blocks;
	1041
	1042	if (not open IN, "Blocks.txt") {
	1043	die "$0: Blocks.txt: $!\n";
	1044	}
	1045
	1046	while (<IN>)
	1047	{
	1048	#next if not /Private Use$/;
	1049	next if not /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s;\s(.+?)\s*$/;
	1050
	1051	my ($first, $last, $name) = (hex($1), hex($2), $3);
	1052
	1053	$Blocks->Append($first, $name);
	1054
	1055	$Blocks{$name} \|\|= Table->New(In => $name,
	1056	Desc => "Block '$name'",
	1057	Fuzzy => 1);
	1058	$Blocks{$name}->Append($first, $name);
	1059
	1060	if ($last and $last != $first) {
	1061	$Blocks->Extend($last);
	1062	$Blocks{$name}->Extend($last);
	1063	}
	1064	}
	1065	close IN;
	1066
	1067	$Blocks->Write("Blocks.pl");
	1068	}
	1069
	1070	##
	1071	## Read in the PropList.txt. It contains extended properties not
	1072	## listed in the UnicodeData.txt, such as 'Other_Alphabetic':
	1073	## alphabetic but not of the general category L; many modifiers
	1074	## belong to this extended property category: while they are not
	1075	## alphabets, they are alphabetic in nature.
	1076	##
	1077	sub PropList_txt()
	1078	{
	1079	my @PropInfo;
	1080
	1081	if (not open IN, "PropList.txt") {
	1082	die "$0: PropList.txt: $!\n";
	1083	}
	1084
	1085	while (<IN>)
	1086	{
	1087	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s;\s(.+?)\s*\#/;
	1088
	1089	# Wait until all the extended properties have been read since
	1090	# they are not listed in numeric order.
	1091	push @PropInfo, [ hex($1), hex($2\|\|""), $3 ];
	1092	}
	1093	close IN;
	1094
	1095	# Now append the extended properties in their code point order.
	1096	my $Props = Table->New();
	1097	my %Prop;
	1098
	1099	for my $prop (sort { $a->[0] <=> $b->[0] } @PropInfo)
	1100	{
	1101	my ($first, $last, $name) = @$prop;
	1102	$Props->Append($first, $name);
	1103
	1104	$Prop{$name} \|\|= Table->New(Is => $name,
	1105	Desc => "Extended property '$name'",
	1106	Fuzzy => 1);
	1107	$Prop{$name}->Append($first, $name);
	1108
	1109	if ($last) {
	1110	$Props->Extend($last);
	1111	$Prop{$name}->Extend($last);
	1112	}
	1113	}
	1114
	1115	# Alphabetic is L and Other_Alphabetic.
	1116	New_Prop(Is => 'Alphabetic',
	1117	Table->Merge($Cat{L}, $Prop{Other_Alphabetic}),
	1118	Desc => '[\p{L}\p{OtherAlphabetic}]', # use canonical names here
	1119	Fuzzy => 1);
	1120
	1121	# Lowercase is Ll and Other_Lowercase.
	1122	New_Prop(Is => 'Lowercase',
	1123	Table->Merge($Cat{Ll}, $Prop{Other_Lowercase}),
	1124	Desc => '[\p{Ll}\p{OtherLowercase}]', # use canonical names here
	1125	Fuzzy => 1);
	1126
	1127	# Uppercase is Lu and Other_Uppercase.
	1128	New_Prop(Is => 'Uppercase',
	1129	Table->Merge($Cat{Lu}, $Prop{Other_Uppercase}),
	1130	Desc => '[\p{Lu}\p{Other_Uppercase}]', # use canonical names here
	1131	Fuzzy => 1);
	1132
	1133	# Math is Sm and Other_Math.
	1134	New_Prop(Is => 'Math',
	1135	Table->Merge($Cat{Sm}, $Prop{Other_Math}),
	1136	Desc => '[\p{Sm}\p{OtherMath}]', # use canonical names here
	1137	Fuzzy => 1);
	1138
	1139	# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
	1140	New_Prop(Is => 'ID_Start',
	1141	Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl]}),
	1142	Desc => '[\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]',
	1143	Fuzzy => 1);
	1144
	1145	# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
	1146	New_Prop(Is => 'ID_Continue',
	1147	Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc ]}),
	1148	Desc => '[\p{ID_Start}\p{Mn}\p{Mc}\p{Nd}\p{Pc}]',
	1149	Fuzzy => 1);
	1150	}
	1151
	1152	sub Make_GC_Aliases()
	1153	{
	1154	##
	1155	## The mapping from General Category long forms to short forms is
	1156	## currently hardwired here since no simple data file in the UCD
	1157	## seems to do that. Unicode 3.2 will assumedly correct this.
	1158	##
	1159	my %Is = (
	1160	'Letter' => 'L',
	1161	'Uppercase_Letter' => 'Lu',
	1162	'Lowercase_Letter' => 'Ll',
	1163	'Titlecase_Letter' => 'Lt',
	1164	'Modifier_Letter' => 'Lm',
	1165	'Other_Letter' => 'Lo',
	1166
	1167	'Mark' => 'M',
	1168	'Non_Spacing_Mark' => 'Mn',
	1169	'Spacing_Mark' => 'Mc',
	1170	'Enclosing_Mark' => 'Me',
	1171
	1172	'Separator' => 'Z',
	1173	'Space_Separator' => 'Zs',
	1174	'Line_Separator' => 'Zl',
	1175	'Paragraph_Separator' => 'Zp',
	1176
	1177	'Number' => 'N',
	1178	'Decimal_Number' => 'Nd',
	1179	'Letter_Number' => 'Nl',
	1180	'Other_Number' => 'No',
	1181
	1182	'Punctuation' => 'P',
	1183	'Connector_Punctuation' => 'Pc',
	1184	'Dash_Punctuation' => 'Pd',
	1185	'Open_Punctuation' => 'Ps',
	1186	'Close_Punctuation' => 'Pe',
	1187	'Initial_Punctuation' => 'Pi',
	1188	'Final_Punctuation' => 'Pf',
	1189	'Other_Punctuation' => 'Po',
	1190
	1191	'Symbol' => 'S',
	1192	'Math_Symbol' => 'Sm',
	1193	'Currency_Symbol' => 'Sc',
	1194	'Modifier_Symbol' => 'Sk',
	1195	'Other_Symbol' => 'So',
	1196
	1197	'Other' => 'C',
	1198	'Control' => 'Cc',
	1199	'Format' => 'Cf',
	1200	'Surrogate' => 'Cs',
	1201	'Private Use' => 'Co',
	1202	'Unassigned' => 'Cn',
	1203	);
	1204
	1205	## make the aliases....
	1206	while (my ($Alias, $Name) = each %Is) {
	1207	New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1);
	1208	}
	1209	}
	1210
	1211
	1212	##
	1213	## These are used in:
	1214	## MakePropTestScript()
	1215	## WriteAllMappings()
	1216	## for making the test script.
	1217	##
	1218	my %FuzzyNameToTest;
	1219	my %ExactNameToTest;
	1220
	1221
	1222	## This used only for making the test script
	1223	sub GenTests($$$$)
	1224	{
	1225	my $FH = shift;
	1226	my $Prop = shift;
	1227	my $MatchCode = shift;
	1228	my $FailCode = shift;
	1229
	1230	if (defined $MatchCode) {
	1231	printf $FH qq/Expect(1, "\\x{%04X}", '\\p{$Prop}' );\n/, $MatchCode;
	1232	printf $FH qq/Expect(0, "\\x{%04X}", '\\p{^$Prop}');\n/, $MatchCode;
	1233	printf $FH qq/Expect(0, "\\x{%04X}", '\\P{$Prop}' );\n/, $MatchCode;
	1234	printf $FH qq/Expect(1, "\\x{%04X}", '\\P{^$Prop}');\n/, $MatchCode;
	1235	}
	1236	if (defined $FailCode) {
	1237	printf $FH qq/Expect(0, "\\x{%04X}", '\\p{$Prop}' );\n/, $FailCode;
	1238	printf $FH qq/Expect(1, "\\x{%04X}", '\\p{^$Prop}');\n/, $FailCode;
	1239	printf $FH qq/Expect(1, "\\x{%04X}", '\\P{$Prop}' );\n/, $FailCode;
	1240	printf $FH qq/Expect(0, "\\x{%04X}", '\\P{^$Prop}');\n/, $FailCode;
	1241	}
	1242	}
	1243
	1244	## This used only for making the test script
	1245	sub ExpectError($$)
	1246	{
	1247	my $FH = shift;
	1248	my $prop = shift;
	1249
	1250	print $FH qq/Error('\\p{$prop}');\n/;
	1251	print $FH qq/Error('\\P{$prop}');\n/;
	1252	}
	1253
	1254	## This used only for making the test script
	1255	my @GoodSeps = (
	1256	" ",
	1257	"-",
	1258	" \t ",
	1259	"",
	1260	"",
	1261	"_",
	1262	);
	1263	my @BadSeps = (
	1264	"--",
	1265	"__",
	1266	" _",
	1267	"/"
	1268	);
	1269
	1270	## This used only for making the test script
	1271	sub RandomlyFuzzifyName($;$)
	1272	{
	1273	my $Name = shift;
	1274	my $WantError = shift; ## if true, make an error
	1275
	1276	my @parts;
	1277	for my $part (split /[-\s_]+/, $Name)
	1278	{
	1279	if (@parts) {
	1280	if ($WantError and rand() < 0.3) {
	1281	push @parts, $BadSeps[rand(@BadSeps)];
	1282	$WantError = 0;
	1283	} else {
	1284	push @parts, $GoodSeps[rand(@GoodSeps)];
	1285	}
	1286	}
	1287	my $switch = int rand(4);
	1288	if ($switch == 0) {
	1289	push @parts, uc $part;
	1290	} elsif ($switch == 1) {
	1291	push @parts, lc $part;
	1292	} elsif ($switch == 2) {
	1293	push @parts, ucfirst $part;
	1294	} else {
	1295	push @parts, $part;
	1296	}
	1297	}
	1298	my $new = join('', @parts);
	1299
	1300	if ($WantError) {
	1301	if (rand() >= 0.5) {
	1302	$new .= $BadSeps[rand(@BadSeps)];
	1303	} else {
	1304	$new = $BadSeps[rand(@BadSeps)] . $new;
	1305	}
	1306	}
	1307	return $new;
	1308	}
	1309
	1310	## This used only for making the test script
	1311	sub MakePropTestScript()
	1312	{
	1313	## this written directly -- it's huge.
	1314	if (not open OUT, ">TestProp.pl") {
	1315	die "$0: TestProp.pl: $!\n";
	1316	}
	1317	print OUT <DATA>;
	1318
	1319	while (my ($Name, $Table) = each %ExactNameToTest)
	1320	{
	1321	GenTests(*OUT, $Name, $Table->ValidCode, $Table->InvalidCode);
	1322	ExpectError(*OUT, uc $Name) if uc $Name ne $Name;
	1323	ExpectError(*OUT, lc $Name) if lc $Name ne $Name;
	1324	}
	1325
	1326
	1327	while (my ($Name, $Table) = each %FuzzyNameToTest)
	1328	{
	1329	my $Orig = $CanonicalToOrig{$Name};
	1330	my %Names = (
	1331	$Name => 1,
	1332	$Orig => 1,
	1333	RandomlyFuzzifyName($Orig) => 1
	1334	);
	1335
	1336	for my $N (keys %Names) {
	1337	GenTests(*OUT, $N, $Table->ValidCode, $Table->InvalidCode);
	1338	}
	1339
	1340	ExpectError(*OUT, RandomlyFuzzifyName($Orig, 'ERROR'));
	1341	}
	1342
	1343	print OUT "Finished();\n";
	1344	close OUT;
	1345	}
	1346
	1347
	1348	##
	1349	## These are used only in:
	1350	## RegisterFileForName()
	1351	## WriteAllMappings()
	1352	##
	1353	my %Exact; ## will become %utf8::Exact;
	1354	my %Canonical; ## will become %utf8::Canonical;
	1355	my %CaComment; ## Comment for %Canonical entry of same key
	1356
	1357	##
	1358	## Given info about a name and a datafile that it should be associated with,
	1359	## register that assocation in %Exact and %Canonical.
	1360	sub RegisterFileForName($$$$)
	1361	{
	1362	my $Type = shift;
	1363	my $Name = shift;
	1364	my $IsFuzzy = shift;
	1365	my $filename = shift;
	1366
	1367	##
	1368	## Now in details for the mapping. $Type eq 'Is' has the
	1369	## Is removed, as it will be removed in utf8_heavy when this
	1370	## data is being checked. In keeps its "In", but a second
	1371	## sans-In record is written if it doesn't conflict with
	1372	## anything already there.
	1373	##
	1374	if (not $IsFuzzy)
	1375	{
	1376	if ($Type eq 'Is') {
	1377	die "oops[$Name]" if $Exact{$Name};
	1378	$Exact{$Name} = $filename;
	1379	} else {
	1380	die "oops[$Type$Name]" if $Exact{"$Type$Name"};
	1381	$Exact{"$Type$Name"} = $filename;
	1382	$Exact{$Name} = $filename if not $Exact{$Name};
	1383	}
	1384	}
	1385	else
	1386	{
	1387	my $CName = lc $Name;
	1388	if ($Type eq 'Is') {
	1389	die "oops[$CName]" if $Canonical{$CName};
	1390	$Canonical{$CName} = $filename;
	1391	$CaComment{$CName} = $Name if $Name =~ tr/A-Z// >= 2;
	1392	} else {
	1393	die "oops[$Type$CName]" if $Canonical{lc "$Type$CName"};
	1394	$Canonical{lc "$Type$CName"} = $filename;
	1395	$CaComment{lc "$Type$CName"} = "$Type$Name";
	1396	if (not $Canonical{$CName}) {
	1397	$Canonical{$CName} = $filename;
	1398	$CaComment{$CName} = "$Type$Name";
	1399	}
	1400	}
	1401	}
	1402	}
	1403
	1404	##
	1405	## Writes the info accumulated in
	1406	##
	1407	## %TableInfo;
	1408	## %FuzzyNames;
	1409	## %AliasInfo;
	1410	##
	1411	##
	1412	sub WriteAllMappings()
	1413	{
	1414	my @MAP;
	1415
	1416	my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts)
	1417
	1418	## 'Is' MUST come first, so its names have precidence over 'In's
	1419	for my $Type ('Is', 'In')
	1420	{
	1421	my %RawNameToFile; ## a per-$Type cache
	1422
	1423	for my $Name (sort {length $a <=> length $b} keys %{$TableInfo{$Type}})
	1424	{
	1425	## Note: $Name is already canonical
	1426	my $Table = $TableInfo{$Type}->{$Name};
	1427	my $IsFuzzy = $FuzzyNames{$Type}->{$Name};
	1428
	1429	## Need an 8.3 safe filename (which means "an 8 safe" $filename)
	1430	my $filename;
	1431	{
	1432	## 'Is' items lose 'Is' from the basename.
	1433	$filename = $Type eq 'Is' ? $Name : "$Type$Name";
	1434
	1435	$filename =~ s/[^\w_]+/_/g; # "L&" -> "L_"
	1436	substr($filename, 8) = '' if length($filename) > 8;
	1437
	1438	##
	1439	## Make sure the basename doesn't conflict with something we
	1440	## might have already written. If we have, say,
	1441	## InGreekExtended1
	1442	## InGreekExtended2
	1443	## they become
	1444	## InGreekE
	1445	## InGreek2
	1446	##
	1447	while (my $num = $BaseNames{lc $filename}++)
	1448	{
	1449	$num++; ## so basenames with numbers start with '2', which
	1450	## just looks more natural.
	1451	## Want to append $num, but if it'll make the basename longer
	1452	## than 8 characters, pre-truncate $filename so that the result
	1453	## is acceptable.
	1454	my $delta = length($filename) + length($num) - 8;
	1455	if ($delta > 0) {
	1456	substr($filename, -$delta) = $num;
	1457	} else {
	1458	$filename .= $num;
	1459	}
	1460	}
	1461	};
	1462
	1463	##
	1464	## Construct a nice comment to add to the file, and build data
	1465	## for the "./Properties" file along the way.
	1466	##
	1467	my $Comment;
	1468	{
	1469	my $Desc = $TableDesc{$Type}->{$Name} \|\| "";
	1470	## get list of names this table is reference by
	1471	my @Supported = $Name;
	1472	while (my ($Orig, $Alias) = each %{ $AliasInfo{$Type} })
	1473	{
	1474	if ($Orig eq $Name) {
	1475	push @Supported, $Alias;
	1476	}
	1477	}
	1478
	1479	my $TypeToShow = $Type eq 'Is' ? "" : $Type;
	1480	my $OrigProp;
	1481
	1482	$Comment = "This file supports:\n";
	1483	for my $N (@Supported)
	1484	{
	1485	my $IsFuzzy = $FuzzyNames{$Type}->{$N};
	1486	my $Prop = "\\p{$TypeToShow$Name}";
	1487	$OrigProp = $Prop if not $OrigProp; #cache for aliases
	1488	if ($IsFuzzy) {
	1489	$Comment .= "\t$Prop (and fuzzy permutations)\n";
	1490	} else {
	1491	$Comment .= "\t$Prop\n";
	1492	}
	1493	my $MyDesc = ($N eq $Name) ? $Desc : "Alias for $OrigProp ($Desc)";
	1494
	1495	push @MAP, sprintf("%s %-42s %s\n",
	1496	$IsFuzzy ? '*' : ' ', $Prop, $MyDesc);
	1497	}
	1498	if ($Desc) {
	1499	$Comment .= "\nMeaning: $Desc\n";
	1500	}
	1501
	1502	}
	1503	##
	1504	## Okay, write the file...
	1505	##
	1506	$Table->Write("lib/$filename.pl", $Comment);
	1507
	1508	## and register it
	1509	$RawNameToFile{$Name} = $filename;
	1510	RegisterFileForName($Type => $Name, $IsFuzzy, $filename);
	1511
	1512	if ($IsFuzzy)
	1513	{
	1514	my $CName = CanonicalName($Type . '_'. $Name);
	1515	$FuzzyNameToTest{$Name} = $Table if !$FuzzyNameToTest{$Name};
	1516	$FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName};
	1517	} else {
	1518	$ExactNameToTest{$Name} = $Table;
	1519	}
	1520
	1521	}
	1522
	1523	## Register aliase info
	1524	for my $Name (sort {length $a <=> length $b} keys %{$AliasInfo{$Type}})
	1525	{
	1526	my $Alias = $AliasInfo{$Type}->{$Name};
	1527	my $IsFuzzy = $FuzzyNames{$Type}->{$Alias};
	1528	my $filename = $RawNameToFile{$Name};
	1529	die "oops [$Alias]->[$Name]" if not $filename;
	1530	RegisterFileForName($Type => $Alias, $IsFuzzy, $filename);
	1531
	1532	my $Table = $TableInfo{$Type}->{$Name};
	1533	die "oops" if not $Table;
	1534	if ($IsFuzzy)
	1535	{
	1536	my $CName = CanonicalName($Type .'_'. $Alias);
	1537	$FuzzyNameToTest{$Alias} = $Table if !$FuzzyNameToTest{$Alias};
	1538	$FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName};
	1539	} else {
	1540	$ExactNameToTest{$Alias} = $Table;
	1541	}
	1542	}
	1543	}
	1544
	1545	##
	1546	## Write out the property list
	1547	##
	1548	{
	1549	my @OUT = (
	1550	"##\n",
	1551	"## This file created by $0\n",
	1552	"## List of built-in \\p{...}/\\P{...} properties.\n",
	1553	"##\n",
	1554	"## '*' means name may be 'fuzzy'\n",
	1555	"##\n\n",
	1556	sort { substr($a,2) cmp substr($b, 2) } @MAP,
	1557	);
	1558	WriteIfChanged('Properties', @OUT);
	1559	}
	1560
	1561	use Text::Tabs (); ## using this makes the files about half the size
	1562
	1563	## Write Exact.pl
	1564	{
	1565	my @OUT = (
	1566	$HEADER,
	1567	"##\n",
	1568	"## Data in this file used by ../utf8_heavy.pl\n",
	1569	"##\n\n",
	1570	"## Mapping from name to filename in ./lib\n",
	1571	"%utf8::Exact = (\n",
	1572	);
	1573
	1574	for my $Name (sort keys %Exact)
	1575	{
	1576	my $File = $Exact{$Name};
	1577	$Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name ";
	1578	my $Text = sprintf("%-15s => %s,\n", $Name, qq/'$File'/);
	1579	push @OUT, Text::Tabs::unexpand($Text);
	1580	}
	1581	push @OUT, ");\n1;\n";
	1582
	1583	WriteIfChanged('Exact.pl', @OUT);
	1584	}
	1585
	1586	## Write Canonical.pl
	1587	{
	1588	my @OUT = (
	1589	$HEADER,
	1590	"##\n",
	1591	"## Data in this file used by ../utf8_heavy.pl\n",
	1592	"##\n\n",
	1593	"## Mapping from lc(canonical name) to filename in ./lib\n",
	1594	"%utf8::Canonical = (\n",
	1595	);
	1596	my $Trail = ""; ## used just to keep the spacing pretty
	1597	for my $Name (sort keys %Canonical)
	1598	{
	1599	my $File = $Canonical{$Name};
	1600	if ($CaComment{$Name}) {
	1601	push @OUT, "\n" if not $Trail;
	1602	push @OUT, " # $CaComment{$Name}\n";
	1603	$Trail = "\n";
	1604	} else {
	1605	$Trail = "";
	1606	}
	1607	$Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name ";
	1608	my $Text = sprintf(" %-41s => %s,\n$Trail", $Name, qq/'$File'/);
	1609	push @OUT, Text::Tabs::unexpand($Text);
	1610	}
	1611	push @OUT, ");\n1\n";
	1612	WriteIfChanged('Canonical.pl', @OUT);
	1613	}
	1614
	1615	MakePropTestScript() if $MakeTestScript;
	1616	}
	1617
	1618
	1619	sub SpecialCasing_txt()
	1620	{
	1621	#
	1622	# Read in the special cases.
	1623	#
	1624
	1625	my %CaseInfo;
	1626
	1627	if (not open IN, "SpecialCasing.txt") {
	1628	die "$0: SpecialCasing.txt: $!\n";
	1629	}
	1630	while (<IN>) {
	1631	next unless /^[0-9A-Fa-f]+;/;
	1632	s/\#.*//;
	1633	s/\s+$//;
	1634
	1635	my ($code, $lower, $title, $upper, $condition) = split(/\s;\s/);
	1636
	1637	if ($condition) { # not implemented yet
	1638	print "# SKIPPING $_\n" if $Verbose;
	1639	next;
	1640	}
	1641
	1642	# Wait until all the special cases have been read since
	1643	# they are not listed in numeric order.
	1644	my $ix = hex($code);
	1645	push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ]
	1646	unless $code eq $lower;
	1647	push @{$CaseInfo{Title}}, [ $ix, $code, $title ]
	1648	unless $code eq $title;
	1649	push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ]
	1650	unless $code eq $upper;
	1651	}
	1652	close IN;
	1653
	1654	# Now write out the special cases properties in their code point order.
	1655	# Prepend them to the To/{Upper,Lower,Title}.pl.
	1656
	1657	for my $case (qw(Lower Title Upper))
	1658	{
	1659	my $NormalCase = do "To/$case.pl" \|\| die "$0: $@\n";
	1660
	1661	my @OUT = (
	1662	$HEADER, "\n",
	1663	"%utf8::ToSpec$case =\n(\n",
	1664	);
	1665
	1666	for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
	1667	my ($ix, $code, $to) = @$prop;
	1668	my $tostr =
	1669	join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
	1670	push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix;
	1671	# Remove any single-character mappings for
	1672	# the same character since we are going for
	1673	# the special casing rules.
	1674	$NormalCase =~ s/^$code\t\t\w+\n//m;
	1675	}
	1676	push @OUT, (
	1677	");\n\n",
	1678	"return <<'END';\n",
	1679	$NormalCase,
	1680	"END\n"
	1681	);
	1682	WriteIfChanged("To/$case.pl", @OUT);
	1683	}
	1684	}
	1685
	1686	#
	1687	# Read in the case foldings.
	1688	#
	1689	# We will do full case folding, C + F + I (see CaseFolding.txt).
	1690	#
	1691	sub CaseFolding_txt()
	1692	{
	1693	if (not open IN, "CaseFolding.txt") {
	1694	die "$0: CaseFolding.txt: $!\n";
	1695	}
	1696
	1697	my $Fold = Table->New();
	1698	my %Fold;
	1699
	1700	while (<IN>) {
	1701	# Skip status 'S', simple case folding
	1702	next unless /^([0-9A-Fa-f]+)\s;\s([CFI])\s;\s([0-9A-Fa-f]+(?: [0-9A-Fa-f]+))\s;/;
	1703
	1704	my ($code, $status, $fold) = (hex($1), $2, $3);
	1705
	1706	if ($status eq 'C') { # Common: one-to-one folding
	1707	# No append() since several codes may fold into one.
	1708	$Fold->RawAppendRange($code, $code, $fold);
	1709	} else { # F: full, or I: dotted uppercase I -> dotless lowercase I
	1710	$Fold{$code} = $fold;
	1711	}
	1712	}
	1713	close IN;
	1714
	1715	$Fold->Write("To/Fold.pl");
	1716
	1717	#
	1718	# Prepend the special foldings to the common foldings.
	1719	#
	1720	my $CommonFold = do "To/Fold.pl" \|\| die "$0: To/Fold.pl: $!\n";
	1721
	1722	my @OUT = (
	1723	$HEADER, "\n",
	1724	"%utf8::ToSpecFold =\n(\n",
	1725	);
	1726	for my $code (sort { $a <=> $b } keys %Fold) {
	1727	my $foldstr =
	1728	join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
	1729	push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code;
	1730	}
	1731	push @OUT, (
	1732	");\n\n",
	1733	"return <<'END';\n",
	1734	$CommonFold,
	1735	"END\n",
	1736	);
	1737
	1738	WriteIfChanged("To/Fold.pl", @OUT);
	1739	}
	1740
	1741	## Do it....
	1742
	1743	UnicodeData_Txt();
	1744	Make_GC_Aliases();
	1745	PropList_txt();
	1746
	1747	Scripts_txt();
	1748	Blocks_txt();
	1749
	1750	WriteAllMappings();
	1751
	1752	LineBreak_Txt();
	1753	ArabicShaping_txt();
	1754	Jamo_txt();
	1755	SpecialCasing_txt();
	1756	CaseFolding_txt();
	1757
	1758	exit(0);
	1759
	1760	## TRAILING CODE IS USED BY MakePropTestScript()
	1761	__DATA__
	1762	use strict;
	1763	use warnings;
	1764
	1765	my $Tests = 0;
	1766	my $Fails = 0;
	1767
	1768	sub Expect($$$)
	1769	{
	1770	my $Expect = shift;
	1771	my $String = shift;
	1772	my $Regex = shift;
	1773	my $Line = (caller)[2];
	1774
	1775	$Tests++;
	1776	my $RegObj;
	1777	my $result = eval {
	1778	$RegObj = qr/$Regex/;
	1779	$String =~ $RegObj ? 1 : 0
	1780	};
	1781
	1782	if (not defined $result) {
	1783	print "couldn't compile /$Regex/ on $0 line $Line: $@\n";
	1784	$Fails++;
	1785	} elsif ($result ^ $Expect) {
	1786	print "bad result (expected $Expect) on $0 line $Line: $@\n";
	1787	$Fails++;
	1788	}
	1789	}
	1790
	1791	sub Error($)
	1792	{
	1793	my $Regex = shift;
	1794	$Tests++;
	1795	if (eval { 'x' =~ qr/$Regex/; 1 }) {
	1796	$Fails++;
	1797	my $Line = (caller)[2];
	1798	print "expected error for /$Regex/ on $0 line $Line: $@\n";
	1799	}
	1800	}
	1801
	1802	sub Finished()
	1803	{
	1804	if ($Fails == 0) {
	1805	print "All $Tests tests passed.\n";
	1806	exit(0);
	1807	} else {
	1808	print "$Tests tests, $Fails failed!\n";
	1809	exit(-1);
	1810	}
	1811	}