perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!./perl
	2	#
	3	# Tests that have to do with checking whether characters have (or not have)
	4	# certain Unicode properties; belong (or not belong) to blocks, scripts, etc.
	5	#
	6
	7	use strict;
	8	use warnings;
	9	use 5.010;
	10
	11	BEGIN {
	12	chdir 't' if -d 't';
	13	require './test.pl';
	14	skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
	15	}
	16
	17	sub run_tests;
	18
	19	#
	20	# This is the data to test.
	21	#
	22	# This is a hash; keys are the property to test.
	23	# Values are arrays containing characters to test. The characters can
	24	# have the following formats:
	25	# '\N{CHARACTER NAME}' - Use character with that name
	26	# '\x{1234}' - Use character with that hex escape
	27	# '0x1234' - Use chr() to get that character
	28	# "a" - Character to use
	29	#
	30	# If a character entry starts with ! the character does not belong to the class
	31	#
	32	# If the class is just single letter, we use both \pL and \p{L}
	33	#
	34
	35	use charnames ':full';
	36
	37	my @CLASSES = (
	38	L => ["a", "A"],
	39	Ll => ["b", "!B"],
	40	Lu => ["!c", "C"],
	41	IsLl => ["d", "!D"],
	42	IsLu => ["!e", "E"],
	43	LC => ["f", "!1"],
	44	'L&' => ["g", "!2"],
	45	'Lowercase Letter' => ["h", "!H"],
	46
	47	Common => ["!i", "3"],
	48	Inherited => ["!j", '\x{300}'],
	49
	50	InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'],
	51	InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'],
	52	InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'],
	53	InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'],
	54	InKatakana => ['\N{KATAKANA LETTER SMALL A}'],
	55	IsLatin => ["0x100", "0x212b"],
	56	IsHebrew => ["0x5d0", "0xfb4f"],
	57	IsGreek => ["0x37a", "0x386", "!0x387", "0x388",
	58	"0x38a", "!0x38b", "0x38c"],
	59	HangulSyllables => ['\x{AC00}'],
	60	'Script=Latin' => ['\x{0100}'],
	61	'Block=LatinExtendedA' => ['\x{0100}'],
	62	'Category=UppercaseLetter' => ['\x{0100}'],
	63
	64	#
	65	# It's ok to repeat class names.
	66	#
	67	InLatin1Supplement =>
	68	['!\N{U+7f}', '\N{U+80}', '\N{U+ff}', '!\x{100}'],
	69	InLatinExtendedA =>
	70	['!\N{U+7f}', '!\N{U+80}', '!\N{U+ff}', '\x{100}'],
	71
	72	#
	73	# Properties are case-insensitive, and may have whitespace,
	74	# dashes and underscores.
	75	#
	76	'in-latin1_SUPPLEMENT' => ['\N{U+80}',
	77	'\N{LATIN SMALL LETTER Y WITH DIAERESIS}'],
	78	' ^ In Latin 1 Supplement '
	79	=> ['!\N{U+80}', '\N{COFFIN}'],
	80	'latin-1 supplement' => ['\N{U+80}', "0xDF"],
	81
	82	);
	83
	84	my @USER_DEFINED_PROPERTIES = (
	85	#
	86	# User defined properties
	87	#
	88	InKana1 => ['\x{3040}', '!\x{303F}'],
	89	InKana2 => ['\x{3040}', '!\x{303F}'],
	90	InKana3 => ['\x{3041}', '!\x{3040}'],
	91	InNotKana => ['\x{3040}', '!\x{3041}'],
	92	InConsonant => ['d', '!e'],
	93	IsSyriac1 => ['\x{0712}', '!\x{072F}'],
	94	IsSyriac1KanaMark => ['\x{309A}', '!\x{3090}'],
	95	IsSyriac1KanaMark => ['\x{0730}', '!\x{0712}'],
	96	'# User-defined character properties may lack \n at the end',
	97	InGreekSmall => ['\N{GREEK SMALL LETTER PI}',
	98	'\N{GREEK SMALL LETTER FINAL SIGMA}'],
	99	InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'],
	100	Dash => ['-'],
	101	ASCII_Hex_Digit => ['!-', 'A'],
	102	IsAsciiHexAndDash => ['-', 'A'],
	103	);
	104
	105	my @USER_CASELESS_PROPERTIES = (
	106	#
	107	# User defined properties which differ depending on /i. Second entry is
	108	# false normally, true under /i
	109	#
	110	'IsMyUpper' => ["M", "!m" ],
	111	);
	112
	113
	114	#
	115	# From the short properties we populate POSIX-like classes.
	116	#
	117	my %SHORT_PROPERTIES = (
	118	'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'],
	119	'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'],
	120	'Lo' => ['\N{HIRAGANA LETTER SMALL A}'],
	121	# is also in other alphabetic
	122	'Mn' => ['\N{HEBREW POINT RAFE}'],
	123	'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'],
	124	'Pc' => ["_"],
	125	'Po' => ["!"],
	126	'Zs' => [" "],
	127	'Cc' => ['\x{00}'],
	128	);
	129
	130	#
	131	# Illegal properties
	132	#
	133	my @ILLEGAL_PROPERTIES =
	134	qw[q qrst f foo isfoo infoo ISfoo INfoo Is::foo In::foo];
	135
	136	my %d;
	137
	138	while (my ($class, $chars) = each %SHORT_PROPERTIES) {
	139	push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars;
	140	push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars;
	141	push @{$d {IsASCII}} => map {length ($_) == 1 \|\| $_ eq '\x{00}'
	142	? $_ : "!$_"} @$chars;
	143	push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars;
	144	push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars;
	145	push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars;
	146	push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]\|Co)/
	147	? $_ : "!$_"} @$chars;
	148	push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]\|Co\|Zs)/
	149	? $_ : "!$_"} @$chars;
	150	push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars;
	151	push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars;
	152	push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars;
	153	push @{$d {IsWord}} => map {$class =~ /^[LMN]/ \|\| $_ eq "_"
	154	? $_ : "!$_"} @$chars;
	155	push @{$d {IsSpace}} => map {$class =~ /^Z/ \|\|
	156	length ($_) == 1 && utf8::native_to_unicode(ord ($_)) >= 0x09
	157	&& utf8::native_to_unicode(ord ($_)) <= 0x0D
	158	? $_ : "!$_"} @$chars;
	159	}
	160
	161	push @CLASSES => "# Short properties" => %SHORT_PROPERTIES,
	162	"# POSIX like properties" => %d,
	163	"# User defined properties" => @USER_DEFINED_PROPERTIES;
	164
	165
	166	#
	167	# Calculate the number of tests.
	168	#
	169	my $count = 0;
	170	for (my $i = 0; $i < @CLASSES; $i += 2) {
	171	$i ++, redo if $CLASSES [$i] =~ /^\h#\h(.*)/;
	172	$count += 2 * (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]};
	173	}
	174	$count += 4 * @ILLEGAL_PROPERTIES;
	175	$count += 4 * grep {length $_ == 1} @ILLEGAL_PROPERTIES;
	176	$count += 8 * @USER_CASELESS_PROPERTIES;
	177	$count += 1; # Test for pkg:IsMyLower
	178
	179	plan(tests => $count);
	180
	181	run_tests unless caller ();
	182
	183	sub match {
	184	my ($char, $match, $nomatch, $caseless) = @_;
	185	$caseless = "" unless defined $caseless;
	186	$caseless = 'i' if $caseless;
	187
	188	my ($str, $name);
	189
	190	if ($char =~ /^\\/) {
	191	$str = eval qq ["$char"];
	192	$name = qq ["$char"];
	193	}
	194	elsif ($char =~ /^0x([0-9A-Fa-f]+)$/) {
	195	$str = chr hex $1;
	196	$name = "chr ($char)";
	197	}
	198	else {
	199	$str = $char;
	200	$name = qq ["$char"];
	201	}
	202
	203	undef $@;
	204	my $pat = "qr/$match/$caseless";
	205	my $match_pat = eval $pat;
	206	is($@, '', "$pat compiled correctly to a regexp: $@");
	207	like($str, $match_pat, "$name correctly matched");
	208
	209	undef $@;
	210	$pat = "qr/$nomatch/$caseless";
	211	my $nomatch_pat = eval $pat;
	212	is($@, '', "$pat compiled correctly to a regexp: $@");
	213	unlike($str, $nomatch_pat, "$name correctly did not match");
	214	}
	215
	216	sub run_tests {
	217
	218	while (@CLASSES) {
	219	my $class = shift @CLASSES;
	220	if ($class =~ /^\h#\h(.*)/) {
	221	print "# $1\n";
	222	next;
	223	}
	224	last unless @CLASSES;
	225	my $chars = shift @CLASSES;
	226	my @in = grep {!/^!./} @$chars;
	227	my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars;
	228	my $in_pat = eval qq ['\\p{$class}'];
	229	my $out_pat = eval qq ['\\P{$class}'];
	230
	231	match $_, $in_pat, $out_pat for @in;
	232	match $_, $out_pat, $in_pat for @out;
	233
	234	if (1 == length $class) { # Repeat without braces if name length 1
	235	my $in_pat = eval qq ['\\p$class'];
	236	my $out_pat = eval qq ['\\P$class'];
	237
	238	match $_, $in_pat, $out_pat for @in;
	239	match $_, $out_pat, $in_pat for @out;
	240	}
	241	}
	242
	243
	244	print "# Illegal properties\n";
	245	foreach my $p (@ILLEGAL_PROPERTIES) {
	246	my $pat;
	247	if ($p =~ /::/) {
	248	$pat = qr /^Illegal user-defined property name/;
	249	}
	250	else {
	251	$pat = qr /^Can't find Unicode property definition/;
	252	}
	253
	254	undef $@;
	255	my $r = eval "'a' =~ /\\p{$p}/; 1";
	256	is($r, undef, "Unknown Unicode property \\p{$p}");
	257	like($@, $pat, "Unknown Unicode property \\p{$p}");
	258	undef $@;
	259	my $s = eval "'a' =~ /\\P{$p}/; 1";
	260	is($s, undef, "Unknown Unicode property \\p{$p}");
	261	like($@, $pat, "Unknown Unicode property \\p{$p}");
	262	if (length $p == 1) {
	263	undef $@;
	264	my $r = eval "'a' =~ /\\p$p/; 1";
	265	is($r, undef, "Unknown Unicode property \\p$p");
	266	like($@, $pat, "Unknown Unicode property \\p$p");
	267	undef $@;
	268	my $s = eval "'a' =~ /\\P$p/; 1";
	269	is($r, undef, "Unknown Unicode property \\P$p");
	270	like($@, $pat, "Unknown Unicode property \\P$p");
	271	}
	272	}
	273
	274	print "# User-defined properties with /i differences\n";
	275	while (my $class = shift @USER_CASELESS_PROPERTIES) {
	276	my $chars_ref = shift @USER_CASELESS_PROPERTIES;
	277	my @in = grep {!/^!./} @$chars_ref;
	278	my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars_ref;
	279	my $in_pat = eval qq ['\\p{$class}'];
	280	my $out_pat = eval qq ['\\P{$class}'];
	281
	282	# Verify works as regularly for not /i
	283	match $_, $in_pat, $out_pat for @in;
	284	match $_, $out_pat, $in_pat for @out;
	285
	286	# Verify that adding /i doesn't change the in set.
	287	match $_, $in_pat, $out_pat, 'i' for @in;
	288
	289	# Verify that adding /i does change the out set to match.
	290	match $_, $in_pat, $out_pat, 'i' for @out;
	291	}
	292	}
	293
	294
	295	#
	296	# User defined properties
	297	#
	298
	299	sub InKana1 {<<'--'}
	300	3040 309F
	301	30A0 30FF
	302	--
	303
	304	sub InKana2 {<<'--'}
	305	+utf8::InHiragana
	306	+utf8::InKatakana
	307	--
	308
	309	sub InKana3 {<<'--'}
	310	+utf8::InHiragana
	311	+utf8::InKatakana
	312	-utf8::IsCn
	313	--
	314
	315	sub InNotKana {<<'--'}
	316	!utf8::InHiragana
	317	-utf8::InKatakana
	318	+utf8::IsCn
	319	--
	320
	321	sub InConsonant {
	322
	323	my $return = "+utf8::Lowercase\n&utf8::ASCII\n";
	324	$return .= sprintf("-%X\n", ord "a");
	325	$return .= sprintf("-%X\n", ord "e");
	326	$return .= sprintf("-%X\n", ord "i");
	327	$return .= sprintf("-%X\n", ord "o");
	328	$return .= sprintf("-%X\n", ord "u");
	329	return $return;
	330	}
	331
	332	sub IsSyriac1 {<<'--'}
	333	0712 072C
	334	0730 074A
	335	--
	336
	337	sub InGreekSmall {return "03B1\t03C9"}
	338	sub InGreekCapital {return "0391\t03A9\n-03A2"}
	339
	340	sub IsAsciiHexAndDash {<<'--'}
	341	+utf8::ASCII_Hex_Digit
	342	+utf8::Dash
	343	--
	344
	345	sub IsMyUpper {
	346	my $caseless = shift;
	347	return "+utf8::"
	348	. (($caseless)
	349	? 'Alphabetic'
	350	: 'Uppercase')
	351	. "\n&utf8::ASCII";
	352	}
	353
	354	{ # This has to be done here and not like the others, because we have to
	355	# make sure that the property is not known until after the regex is
	356	# compiled. It was previously getting confused about the pkg and /i
	357	# combination
	358
	359	my $mylower = qr/\p{pkg::IsMyLower}/i;
	360
	361	sub pkg::IsMyLower {
	362	my $caseless = shift;
	363	return "+utf8::"
	364	. (($caseless)
	365	? 'Alphabetic'
	366	: 'Lowercase')
	367	. "\n&utf8::ASCII";
	368	}
	369
	370	like("A", $mylower, "Not available until runtime user-defined property with pkg:: and /i works");
	371
	372	}
	373
	374	# Verify that can use user-defined properties inside another one
	375	sub IsSyriac1KanaMark {<<'--'}
	376	+main::IsSyriac1
	377	+main::InKana3
	378	&utf8::IsMark
	379	--
	380
	381	# fake user-defined properties; these subs shouldn't be called, because
	382	# their names don't start with In or Is
	383
	384	sub f { die }
	385	sub foo { die }
	386	sub isfoo { die }
	387	sub infoo { die }
	388	sub ISfoo { die }
	389	sub INfoo { die }
	390	sub Is::foo { die }
	391	sub In::foo { die }
	392	__END__