perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!perl -w
	2
	3	BEGIN {
	4	require 'loc_tools.pl'; # Contains locales_enabled() and
	5	# find_utf8_ctype_locale()
	6	}
	7
	8	use strict;
	9	use Test::More;
	10	use Config;
	11
	12	use XS::APItest;
	13
	14	my $tab = " " x 4; # Indent subsidiary tests this much
	15
	16	use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);
	17	my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias");
	18
	19	sub get_charname($) {
	20	my $cp = shift;
	21
	22	# If there is a an abbreviation for the code point name, use it
	23	my $name_index = search_invlist(\@{$charname_list}, $cp);
	24	if (defined $name_index) {
	25	my $synonyms = $charname_map->[$name_index];
	26	if (ref $synonyms) {
	27	my $pat = qr/: abbreviation/;
	28	my @abbreviations = grep { $_ =~ $pat } @$synonyms;
	29	if (@abbreviations) {
	30	return $abbreviations[0] =~ s/$pat//r;
	31	}
	32	}
	33	}
	34
	35	# Otherwise, use the full name
	36	use charnames ();
	37	return charnames::viacode($cp) // "No name";
	38	}
	39
	40	sub truth($) { # Converts values so is() works
	41	return (shift) ? 1 : 0;
	42	}
	43
	44	my $base_locale;
	45	my $utf8_locale;
	46	if(locales_enabled('LC_ALL')) {
	47	require POSIX;
	48	$base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
	49	if (defined $base_locale && $base_locale eq 'C') {
	50	use locale; # make \w work right in non-ASCII lands
	51
	52	# Some locale implementations don't have the 128-255 characters all
	53	# mean nothing. Skip the locale tests in that situation
	54	for my $i (128 .. 255) {
	55	if (chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/) {
	56	undef $base_locale;
	57	last;
	58	}
	59	}
	60
	61	$utf8_locale = find_utf8_ctype_locale() if $base_locale;
	62	}
	63	}
	64
	65	sub get_display_locale_or_skip($$) {
	66
	67	# Helper function intimately tied to its callers. It knows the loop
	68	# iterates with a locale of "", meaning don't use locale; $base_locale
	69	# meaning to use a non-UTF-8 locale; and $utf8_locale.
	70	#
	71	# It checks to see if the current test should be skipped or executed,
	72	# returning an empty list for the former, and for the latter:
	73	# ( 'locale display name',
	74	# bool of is this a UTF-8 locale )
	75	#
	76	# The display name is the empty string if not using locale. Functions
	77	# with _LC in their name are skipped unless in locale, and functions
	78	# without _LC are executed only outside locale.
	79
	80	my ($locale, $suffix) = @_;
	81
	82	# The test should be skipped if the input is for a non-existent locale
	83	return unless defined $locale;
	84
	85	# Here the input is defined, either a locale name or "". If the test is
	86	# for not using locales, we want to do the test for non-LC functions,
	87	# and skip it for LC ones.
	88	if ($locale eq "") {
	89	return ("", 0) if $suffix !~ /LC/;
	90	return;
	91	}
	92
	93	# Here the input is for a real locale. We don't test the non-LC functions
	94	# for locales.
	95	return if $suffix !~ /LC/;
	96
	97	# Here is for a LC function and a real locale. The base locale is not
	98	# UTF-8.
	99	return (" ($locale locale)", 0) if $locale eq $base_locale;
	100
	101	# The only other possibility is that we have a UTF-8 locale
	102	return (" ($locale)", 1);
	103	}
	104
	105	sub try_malforming($$$)
	106	{
	107	# Determines if the tests for malformed UTF-8 should be done. When done,
	108	# the .xs code creates malformations by pretending the length is shorter
	109	# than it actually is. Some things can't be malformed, and sometimes this
	110	# test knows that the current code doesn't look for a malformation under
	111	# various circumstances.
	112
	113	my ($i, $function, $using_locale) = @_;
	114	# $i is unicode code point;
	115
	116	# Single bytes can't be malformed
	117	return 0 if $i < ((ord "A" == 65) ? 128 : 160);
	118
	119	# ASCII doesn't need to ever look beyond the first byte.
	120	return 0 if $function eq "ASCII";
	121
	122	# No controls above 255, so the code doesn't look at those
	123	return 0 if $i > 255 && $function eq "CNTRL";
	124
	125	# No non-ASCII digits below 256, except if using locales.
	126	return 0 if $i < 256 && ! $using_locale && $function =~ /X?DIGIT/;
	127
	128	return 1;
	129	}
	130
	131	my %properties = (
	132	# name => Lookup-property name
	133	alnum => 'Word',
	134	wordchar => 'Word',
	135	alphanumeric => 'Alnum',
	136	alpha => 'XPosixAlpha',
	137	ascii => 'ASCII',
	138	blank => 'Blank',
	139	cntrl => 'Control',
	140	digit => 'Digit',
	141	graph => 'Graph',
	142	idfirst => '_Perl_IDStart',
	143	idcont => '_Perl_IDCont',
	144	lower => 'XPosixLower',
	145	print => 'Print',
	146	psxspc => 'XPosixSpace',
	147	punct => 'XPosixPunct',
	148	quotemeta => '_Perl_Quotemeta',
	149	space => 'XPerlSpace',
	150	vertws => 'VertSpace',
	151	upper => 'XPosixUpper',
	152	xdigit => 'XDigit',
	153	);
	154
	155	my %seen;
	156	my @warnings;
	157	local $SIG{__WARN__} = sub { push @warnings, @_ };
	158
	159	my %utf8_param_code = (
	160	"_safe" => 0,
	161	"_safe, malformed" => 1,
	162	"deprecated unsafe" => -1,
	163	"deprecated mathoms" => -2,
	164	);
	165
	166	foreach my $name (sort keys %properties, 'octal') {
	167	my @invlist;
	168	if ($name eq 'octal') {
	169	# Hand-roll an inversion list with 0-7 in it and nothing else.
	170	push @invlist, ord "0", ord "8";
	171	}
	172	else {
	173	my $property = $properties{$name};
	174	@invlist = prop_invlist($property, '_perl_core_internal_ok');
	175	if (! @invlist) {
	176
	177	# An empty return could mean an unknown property, or merely that
	178	# it is empty. Call in scalar context to differentiate
	179	if (! prop_invlist($property, '_perl_core_internal_ok')) {
	180	fail("No inversion list found for $property");
	181	next;
	182	}
	183	}
	184	}
	185
	186	# Include all the Latin1 code points, plus 0x100.
	187	my @code_points = (0 .. 256);
	188
	189	# Then include the next few boundaries above those from this property
	190	my $above_latins = 0;
	191	foreach my $range_start (@invlist) {
	192	next if $range_start < 257;
	193	push @code_points, $range_start - 1, $range_start;
	194	$above_latins++;
	195	last if $above_latins > 5;
	196	}
	197
	198	# This makes sure we are using the Perl definition of idfirst and idcont,
	199	# and not the Unicode. There are a few differences.
	200	push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first\|cont)/;
	201	if ($name eq "idcont") { # And some that are continuation but not start
	202	push @code_points, ord("\N{GREEK ANO TELEIA}"),
	203	ord("\N{COMBINING GRAVE ACCENT}");
	204	}
	205
	206	# And finally one non-Unicode code point.
	207	push @code_points, 0x110000; # Above Unicode, no prop should match
	208	no warnings 'non_unicode';
	209
	210	for my $j (@code_points) {
	211	my $i = utf8::native_to_unicode($j);
	212	my $function = uc($name);
	213
	214	is (@warnings, 0, "Got no unexpected warnings in previous iteration")
	215	or diag("@warnings");
	216	undef @warnings;
	217
	218	my $matches = search_invlist(\@invlist, $j);
	219	if (! defined $matches) {
	220	$matches = 0;
	221	}
	222	else {
	223	$matches = truth(! ($matches % 2));
	224	}
	225
	226	my $ret;
	227	my $char_name = get_charname($j);
	228	my $display_name = sprintf "\\x{%02X, %s}", $j, $char_name;
	229	my $display_call = "is${function}( $display_name )";
	230
	231	foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
	232	"_LC_uvchr", "_utf8", "_LC_utf8")
	233	{
	234
	235	# Not all possible macros have been defined
	236	if ($name eq 'vertws') {
	237
	238	# vertws is always all of Unicode
	239	next if $suffix !~ / ^ _ ( uni \| uvchr \| utf8 ) $ /x;
	240	}
	241	elsif ($name eq 'alnum') {
	242
	243	# ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
	244	# suffixes were added later, after WORDCHAR was created to be
	245	# a clearer synonym for ALNUM
	246	next if $suffix eq '_A'
	247	\|\| $suffix eq '_L1'
	248	\|\| $suffix eq '_uvchr';
	249	}
	250	elsif ($name eq 'octal') {
	251	next if $suffix ne "" && $suffix ne '_A' && $suffix ne '_L1';
	252	}
	253	elsif ($name eq 'quotemeta') {
	254	# There is only one macro for this, and is defined only for
	255	# Latin1 range
	256	next if $suffix ne ""
	257	}
	258
	259	foreach my $locale ("", $base_locale, $utf8_locale) {
	260
	261	my ($display_locale, $locale_is_utf8)
	262	= get_display_locale_or_skip($locale, $suffix);
	263	next unless defined $display_locale;
	264
	265	use if $locale, "locale";
	266	POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
	267
	268	if ($suffix !~ /utf8/) { # _utf8 has to handled specially
	269	my $display_call
	270	= "is${function}$suffix( $display_name )$display_locale";
	271	$ret = truth eval "test_is${function}$suffix($j)";
	272	if (is ($@, "", "$display_call didn't give error")) {
	273	my $truth = $matches;
	274	if ($truth) {
	275
	276	# The single byte functions are false for
	277	# above-Latin1
	278	if ($j >= 256) {
	279	$truth = 0
	280	if $suffix=~ / ^ ( _A \| _L [1C] )? $ /x;
	281	}
	282	elsif ( $i >= 128
	283	&& $name ne 'quotemeta')
	284	{
	285
	286	# The no-suffix and _A functions are false
	287	# for non-ASCII. So are _LC functions on a
	288	# non-UTF-8 locale
	289	$truth = 0 if $suffix eq "_A"
	290	\|\| $suffix eq ""
	291	\|\| ( $suffix =~ /LC/
	292	&& ! $locale_is_utf8);
	293	}
	294	}
	295
	296	is ($ret, $truth, "${tab}And correctly returns $truth");
	297	}
	298	}
	299	else { # _utf8 suffix
	300	my $char = chr($j);
	301	utf8::upgrade($char);
	302	$char = quotemeta $char if $char eq '\\' \|\| $char eq "'";
	303	my $truth;
	304	if ( $suffix =~ /LC/
	305	&& ! $locale_is_utf8
	306	&& $j < 256
	307	&& $i >= 128)
	308	{ # The C-locale _LC function returns FALSE for Latin1
	309	# above ASCII
	310	$truth = 0;
	311	}
	312	else {
	313	$truth = $matches;
	314	}
	315
	316	foreach my $utf8_param("_safe",
	317	"_safe, malformed",
	318	"deprecated unsafe"
	319	)
	320	{
	321	my $utf8_param_code = $utf8_param_code{$utf8_param};
	322	my $expect_error = $utf8_param_code > 0;
	323	next if $expect_error
	324	&& ! try_malforming($i, $function,
	325	$suffix =~ /LC/);
	326
	327	my $display_call = "is${function}$suffix( $display_name"
	328	. ", $utf8_param )$display_locale";
	329	$ret = truth eval "test_is${function}$suffix('$char',"
	330	. " $utf8_param_code)";
	331	if ($expect_error) {
	332	isnt ($@, "",
	333	"expected and got error in $display_call");
	334	like($@, qr/Malformed UTF-8 character/,
	335	"${tab}And got expected message");
	336	if (is (@warnings, 1,
	337	"${tab}Got a single warning besides"))
	338	{
	339	like($warnings[0],
	340	qr/Malformed UTF-8 character.*short/,
	341	"${tab}Got expected warning");
	342	}
	343	else {
	344	diag("@warnings");
	345	}
	346	undef @warnings;
	347	}
	348	elsif (is ($@, "", "$display_call didn't give error")) {
	349	is ($ret, $truth,
	350	"${tab}And correctly returned $truth");
	351	if ($utf8_param_code < 0) {
	352	my $warnings_ok;
	353	my $unique_function = "is" . $function . $suffix;
	354	if (! $seen{$unique_function}++) {
	355	$warnings_ok = is(@warnings, 1,
	356	"${tab}This is first call to"
	357	. " $unique_function; Got a single"
	358	. " warning");
	359	if ($warnings_ok) {
	360	$warnings_ok = like($warnings[0],
	361	qr/starting in Perl .* will require an additional parameter/,
	362	"${tab}The warning was the expected"
	363	. " deprecation one");
	364	}
	365	}
	366	else {
	367	$warnings_ok = is(@warnings, 0,
	368	"${tab}This subsequent call to"
	369	. " $unique_function did not warn");
	370	}
	371	$warnings_ok or diag("@warnings");
	372	undef @warnings;
	373	}
	374	}
	375	}
	376	}
	377	}
	378	}
	379	}
	380	}
	381
	382	my %to_properties = (
	383	FOLD => 'Case_Folding',
	384	LOWER => 'Lowercase_Mapping',
	385	TITLE => 'Titlecase_Mapping',
	386	UPPER => 'Uppercase_Mapping',
	387	);
	388
	389
	390	foreach my $name (sort keys %to_properties) {
	391	my $property = $to_properties{$name};
	392	my ($list_ref, $map_ref, $format, $missing)
	393	= prop_invmap($property, );
	394	if (! $list_ref \|\| ! $map_ref) {
	395	fail("No inversion map found for $property");
	396	next;
	397	}
	398	if ($format !~ / ^ a l? $ /x) {
	399	fail("Unexpected inversion map format ('$format') found for $property");
	400	next;
	401	}
	402
	403	# Include all the Latin1 code points, plus 0x100.
	404	my @code_points = (0 .. 256);
	405
	406	# Then include the next few multi-char folds above those from this
	407	# property, and include the next few single folds as well
	408	my $above_latins = 0;
	409	my $multi_char = 0;
	410	for my $i (0 .. @{$list_ref} - 1) {
	411	my $range_start = $list_ref->[$i];
	412	next if $range_start < 257;
	413	if (ref $map_ref->[$i] && $multi_char < 5) {
	414	push @code_points, $range_start - 1
	415	if $code_points[-1] != $range_start - 1;
	416	push @code_points, $range_start;
	417	$multi_char++;
	418	}
	419	elsif ($above_latins < 5) {
	420	push @code_points, $range_start - 1
	421	if $code_points[-1] != $range_start - 1;
	422	push @code_points, $range_start;
	423	$above_latins++;
	424	}
	425	last if $above_latins >= 5 && $multi_char >= 5;
	426	}
	427
	428	# And finally one non-Unicode code point.
	429	push @code_points, 0x110000; # Above Unicode, no prop should match
	430	no warnings 'non_unicode';
	431
	432	# $j is native; $i unicode.
	433	for my $j (@code_points) {
	434	my $i = utf8::native_to_unicode($j);
	435	my $function = $name;
	436
	437	my $index = search_invlist(\@{$list_ref}, $j);
	438
	439	my $ret;
	440	my $char_name = get_charname($j);
	441	my $display_name = sprintf "\\N{U+%02X, %s}", $j, $char_name;
	442
	443	foreach my $suffix ("", "_L1", "_LC") {
	444
	445	# This is the only macro defined for L1
	446	next if $suffix eq "_L1" && $function ne "LOWER";
	447
	448	SKIP:
	449	foreach my $locale ("", $base_locale, $utf8_locale) {
	450
	451	# titlecase is not defined in locales.
	452	next if $name eq 'TITLE' && $suffix eq "_LC";
	453
	454	my ($display_locale, $locale_is_utf8)
	455	= get_display_locale_or_skip($locale, $suffix);
	456	next unless defined $display_locale;
	457
	458	skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
	459	. "$display_locale", 1)
	460	if $i == 0xDF && $name =~ / FOLD \| UPPER /x
	461	&& $suffix eq "_LC" && $locale_is_utf8;
	462
	463	use if $locale, "locale";
	464	POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
	465
	466	my $display_call = "to${function}$suffix("
	467	. " $display_name )$display_locale";
	468	$ret = eval "test_to${function}$suffix($j)";
	469	if (is ($@, "", "$display_call didn't give error")) {
	470	my $should_be;
	471	if ($j > 255) {
	472	$should_be = $j;
	473	}
	474	elsif ( $i > 127
	475	&& ( $suffix eq ""
	476	\|\| ($suffix eq "_LC" && ! $locale_is_utf8)))
	477	{
	478	$should_be = $j;
	479	}
	480	elsif ($map_ref->[$index] != $missing) {
	481	$should_be = $map_ref->[$index] + $j - $list_ref->[$index]
	482	}
	483	else {
	484	$should_be = $j;
	485	}
	486
	487	is ($ret, $should_be,
	488	sprintf("${tab}And correctly returned 0x%02X",
	489	$should_be));
	490	}
	491	}
	492	}
	493
	494	# The _uni, uvchr, and _utf8 functions return both the ordinal of the
	495	# first code point of the result, and the result in utf8. The .xs
	496	# tests return these in an array, in [0] and [1] respectively, with
	497	# [2] the length of the utf8 in bytes.
	498	my $utf8_should_be = "";
	499	my $first_ord_should_be;
	500	if (ref $map_ref->[$index]) { # A multi-char result
	501	for my $j (0 .. @{$map_ref->[$index]} - 1) {
	502	$utf8_should_be .= chr $map_ref->[$index][$j];
	503	}
	504
	505	$first_ord_should_be = $map_ref->[$index][0];
	506	}
	507	else { # A single-char result
	508	$first_ord_should_be = ($map_ref->[$index] != $missing)
	509	? $map_ref->[$index] + $j
	510	- $list_ref->[$index]
	511	: $j;
	512	$utf8_should_be = chr $first_ord_should_be;
	513	}
	514	utf8::upgrade($utf8_should_be);
	515
	516	# Test _uni, uvchr
	517	foreach my $suffix ('_uni', '_uvchr') {
	518	my $s;
	519	my $len;
	520	my $display_call = "to${function}$suffix( $display_name )";
	521	$ret = eval "test_to${function}$suffix($j)";
	522	if (is ($@, "", "$display_call didn't give error")) {
	523	is ($ret->[0], $first_ord_should_be,
	524	sprintf("${tab}And correctly returned 0x%02X",
	525	$first_ord_should_be));
	526	is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
	527	use bytes;
	528	is ($ret->[2], length $utf8_should_be,
	529	"${tab}Got correct number of bytes for utf8 length");
	530	}
	531	}
	532
	533	# Test _utf8
	534	my $char = chr($j);
	535	utf8::upgrade($char);
	536	$char = quotemeta $char if $char eq '\\' \|\| $char eq "'";
	537	foreach my $utf8_param("_safe",
	538	"_safe, malformed",
	539	"deprecated unsafe",
	540	"deprecated mathoms",
	541	)
	542	{
	543	use Config;
	544	next if $utf8_param eq 'deprecated mathoms'
	545	&& $Config{'ccflags'} =~ /-DNO_MATHOMS/;
	546
	547	my $utf8_param_code = $utf8_param_code{$utf8_param};
	548	my $expect_error = $utf8_param_code > 0;
	549
	550	# Skip if can't malform (because is a UTF-8 invariant)
	551	next if $expect_error && $i < ((ord "A" == 65) ? 128 : 160);
	552
	553	my $display_call = "to${function}_utf8($display_name, $utf8_param )";
	554	$ret = eval "test_to${function}_utf8('$char', $utf8_param_code)";
	555	if ($expect_error) {
	556	isnt ($@, "", "expected and got error in $display_call");
	557	like($@, qr/Malformed UTF-8 character/,
	558	"${tab}And got expected message");
	559	undef @warnings;
	560	}
	561	elsif (is ($@, "", "$display_call didn't give error")) {
	562	is ($ret->[0], $first_ord_should_be,
	563	sprintf("${tab}And correctly returned 0x%02X",
	564	$first_ord_should_be));
	565	is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
	566	use bytes;
	567	is ($ret->[2], length $utf8_should_be,
	568	"${tab}Got correct number of bytes for utf8 length");
	569	if ($utf8_param_code < 0) {
	570	my $warnings_ok;
	571	if (! $seen{"${function}_utf8$utf8_param"}++) {
	572	$warnings_ok = is(@warnings, 1,
	573	"${tab}Got a single warning");
	574	if ($warnings_ok) {
	575	my $expected;
	576	if ($utf8_param_code == -2) {
	577	my $lc_func = lc $function;
	578	$expected
	579	= qr/starting in Perl .* to_utf8_$lc_func will be removed/;
	580	}
	581	else {
	582	$expected
	583	= qr/starting in Perl .* will require an additional parameter/;
	584	}
	585	$warnings_ok = like($warnings[0], $expected,
	586	"${tab}Got expected deprecation warning");
	587	}
	588	}
	589	else {
	590	$warnings_ok = is(@warnings, 0,
	591	"${tab}Deprecation warned only the one time");
	592	}
	593	$warnings_ok or diag("@warnings");
	594	undef @warnings;
	595	}
	596	}
	597	}
	598	}
	599	}
	600
	601	# This is primarily to make sure that no non-Unicode warnings get generated
	602	is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")
	603	or diag @warnings;
	604
	605	done_testing;