| 1 | #!./perl |
| 2 | # |
| 3 | # Tests that have to do with checking whether characters have (or not have) |
| 4 | # certain Unicode properties; belong (or not belong) to blocks, scripts, etc. |
| 5 | # |
| 6 | |
| 7 | use strict; |
| 8 | use warnings; |
| 9 | use 5.010; |
| 10 | |
| 11 | BEGIN { |
| 12 | chdir 't' if -d 't'; |
| 13 | require './test.pl'; |
| 14 | skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)"); |
| 15 | } |
| 16 | |
| 17 | sub run_tests; |
| 18 | |
| 19 | # |
| 20 | # This is the data to test. |
| 21 | # |
| 22 | # This is a hash; keys are the property to test. |
| 23 | # Values are arrays containing characters to test. The characters can |
| 24 | # have the following formats: |
| 25 | # '\N{CHARACTER NAME}' - Use character with that name |
| 26 | # '\x{1234}' - Use character with that hex escape |
| 27 | # '0x1234' - Use chr() to get that character |
| 28 | # "a" - Character to use |
| 29 | # |
| 30 | # If a character entry starts with ! the character does not belong to the class |
| 31 | # |
| 32 | # If the class is just single letter, we use both \pL and \p{L} |
| 33 | # |
| 34 | |
| 35 | use charnames ':full'; |
| 36 | |
| 37 | my @CLASSES = ( |
| 38 | L => ["a", "A"], |
| 39 | Ll => ["b", "!B"], |
| 40 | Lu => ["!c", "C"], |
| 41 | IsLl => ["d", "!D"], |
| 42 | IsLu => ["!e", "E"], |
| 43 | LC => ["f", "!1"], |
| 44 | 'L&' => ["g", "!2"], |
| 45 | 'Lowercase Letter' => ["h", "!H"], |
| 46 | |
| 47 | Common => ["!i", "3"], |
| 48 | Inherited => ["!j", '\x{300}'], |
| 49 | |
| 50 | InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], |
| 51 | InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], |
| 52 | InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], |
| 53 | InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], |
| 54 | InKatakana => ['\N{KATAKANA LETTER SMALL A}'], |
| 55 | IsLatin => ["0x100", "0x212b"], |
| 56 | IsHebrew => ["0x5d0", "0xfb4f"], |
| 57 | IsGreek => ["0x37a", "0x386", "!0x387", "0x388", |
| 58 | "0x38a", "!0x38b", "0x38c"], |
| 59 | HangulSyllables => ['\x{AC00}'], |
| 60 | 'Script=Latin' => ['\x{0100}'], |
| 61 | 'Block=LatinExtendedA' => ['\x{0100}'], |
| 62 | 'Category=UppercaseLetter' => ['\x{0100}'], |
| 63 | |
| 64 | # |
| 65 | # It's ok to repeat class names. |
| 66 | # |
| 67 | InLatin1Supplement => |
| 68 | ['!\N{U+7f}', '\N{U+80}', '\N{U+ff}', '!\x{100}'], |
| 69 | InLatinExtendedA => |
| 70 | ['!\N{U+7f}', '!\N{U+80}', '!\N{U+ff}', '\x{100}'], |
| 71 | |
| 72 | # |
| 73 | # Properties are case-insensitive, and may have whitespace, |
| 74 | # dashes and underscores. |
| 75 | # |
| 76 | 'in-latin1_SUPPLEMENT' => ['\N{U+80}', |
| 77 | '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], |
| 78 | ' ^ In Latin 1 Supplement ' |
| 79 | => ['!\N{U+80}', '\N{COFFIN}'], |
| 80 | 'latin-1 supplement' => ['\N{U+80}', "0xDF"], |
| 81 | |
| 82 | ); |
| 83 | |
| 84 | my @USER_DEFINED_PROPERTIES = ( |
| 85 | # |
| 86 | # User defined properties |
| 87 | # |
| 88 | InKana1 => ['\x{3040}', '!\x{303F}'], |
| 89 | InKana2 => ['\x{3040}', '!\x{303F}'], |
| 90 | InKana3 => ['\x{3041}', '!\x{3040}'], |
| 91 | InNotKana => ['\x{3040}', '!\x{3041}'], |
| 92 | InConsonant => ['d', '!e'], |
| 93 | IsSyriac1 => ['\x{0712}', '!\x{072F}'], |
| 94 | IsSyriac1KanaMark => ['\x{309A}', '!\x{3090}'], |
| 95 | IsSyriac1KanaMark => ['\x{0730}', '!\x{0712}'], |
| 96 | '# User-defined character properties may lack \n at the end', |
| 97 | InGreekSmall => ['\N{GREEK SMALL LETTER PI}', |
| 98 | '\N{GREEK SMALL LETTER FINAL SIGMA}'], |
| 99 | InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], |
| 100 | Dash => ['-'], |
| 101 | ASCII_Hex_Digit => ['!-', 'A'], |
| 102 | IsAsciiHexAndDash => ['-', 'A'], |
| 103 | ); |
| 104 | |
| 105 | my @USER_CASELESS_PROPERTIES = ( |
| 106 | # |
| 107 | # User defined properties which differ depending on /i. Second entry is |
| 108 | # false normally, true under /i |
| 109 | # |
| 110 | 'IsMyUpper' => ["M", "!m" ], |
| 111 | ); |
| 112 | |
| 113 | |
| 114 | # |
| 115 | # From the short properties we populate POSIX-like classes. |
| 116 | # |
| 117 | my %SHORT_PROPERTIES = ( |
| 118 | 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], |
| 119 | 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], |
| 120 | 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], |
| 121 | # is also in other alphabetic |
| 122 | 'Mn' => ['\N{HEBREW POINT RAFE}'], |
| 123 | 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], |
| 124 | 'Pc' => ["_"], |
| 125 | 'Po' => ["!"], |
| 126 | 'Zs' => [" "], |
| 127 | 'Cc' => ['\x{00}'], |
| 128 | ); |
| 129 | |
| 130 | # |
| 131 | # Illegal properties |
| 132 | # |
| 133 | my @ILLEGAL_PROPERTIES = |
| 134 | qw[q qrst f foo isfoo infoo ISfoo INfoo Is::foo In::foo]; |
| 135 | |
| 136 | my %d; |
| 137 | |
| 138 | while (my ($class, $chars) = each %SHORT_PROPERTIES) { |
| 139 | push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; |
| 140 | push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; |
| 141 | push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' |
| 142 | ? $_ : "!$_"} @$chars; |
| 143 | push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; |
| 144 | push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; |
| 145 | push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; |
| 146 | push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ |
| 147 | ? $_ : "!$_"} @$chars; |
| 148 | push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ |
| 149 | ? $_ : "!$_"} @$chars; |
| 150 | push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; |
| 151 | push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; |
| 152 | push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; |
| 153 | push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" |
| 154 | ? $_ : "!$_"} @$chars; |
| 155 | push @{$d {IsSpace}} => map {$class =~ /^Z/ || |
| 156 | length ($_) == 1 && utf8::native_to_unicode(ord ($_)) >= 0x09 |
| 157 | && utf8::native_to_unicode(ord ($_)) <= 0x0D |
| 158 | ? $_ : "!$_"} @$chars; |
| 159 | } |
| 160 | |
| 161 | push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, |
| 162 | "# POSIX like properties" => %d, |
| 163 | "# User defined properties" => @USER_DEFINED_PROPERTIES; |
| 164 | |
| 165 | |
| 166 | # |
| 167 | # Calculate the number of tests. |
| 168 | # |
| 169 | my $count = 0; |
| 170 | for (my $i = 0; $i < @CLASSES; $i += 2) { |
| 171 | $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; |
| 172 | $count += 2 * (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; |
| 173 | } |
| 174 | $count += 4 * @ILLEGAL_PROPERTIES; |
| 175 | $count += 4 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; |
| 176 | $count += 8 * @USER_CASELESS_PROPERTIES; |
| 177 | $count += 1; # Test for pkg:IsMyLower |
| 178 | |
| 179 | plan(tests => $count); |
| 180 | |
| 181 | run_tests unless caller (); |
| 182 | |
| 183 | sub match { |
| 184 | my ($char, $match, $nomatch, $caseless) = @_; |
| 185 | $caseless = "" unless defined $caseless; |
| 186 | $caseless = 'i' if $caseless; |
| 187 | |
| 188 | my ($str, $name); |
| 189 | |
| 190 | if ($char =~ /^\\/) { |
| 191 | $str = eval qq ["$char"]; |
| 192 | $name = qq ["$char"]; |
| 193 | } |
| 194 | elsif ($char =~ /^0x([0-9A-Fa-f]+)$/) { |
| 195 | $str = chr hex $1; |
| 196 | $name = "chr ($char)"; |
| 197 | } |
| 198 | else { |
| 199 | $str = $char; |
| 200 | $name = qq ["$char"]; |
| 201 | } |
| 202 | |
| 203 | undef $@; |
| 204 | my $pat = "qr/$match/$caseless"; |
| 205 | my $match_pat = eval $pat; |
| 206 | is($@, '', "$pat compiled correctly to a regexp: $@"); |
| 207 | like($str, $match_pat, "$name correctly matched"); |
| 208 | |
| 209 | undef $@; |
| 210 | $pat = "qr/$nomatch/$caseless"; |
| 211 | my $nomatch_pat = eval $pat; |
| 212 | is($@, '', "$pat compiled correctly to a regexp: $@"); |
| 213 | unlike($str, $nomatch_pat, "$name correctly did not match"); |
| 214 | } |
| 215 | |
| 216 | sub run_tests { |
| 217 | |
| 218 | while (@CLASSES) { |
| 219 | my $class = shift @CLASSES; |
| 220 | if ($class =~ /^\h*#\h*(.*)/) { |
| 221 | print "# $1\n"; |
| 222 | next; |
| 223 | } |
| 224 | last unless @CLASSES; |
| 225 | my $chars = shift @CLASSES; |
| 226 | my @in = grep {!/^!./} @$chars; |
| 227 | my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; |
| 228 | my $in_pat = eval qq ['\\p{$class}']; |
| 229 | my $out_pat = eval qq ['\\P{$class}']; |
| 230 | |
| 231 | match $_, $in_pat, $out_pat for @in; |
| 232 | match $_, $out_pat, $in_pat for @out; |
| 233 | |
| 234 | if (1 == length $class) { # Repeat without braces if name length 1 |
| 235 | my $in_pat = eval qq ['\\p$class']; |
| 236 | my $out_pat = eval qq ['\\P$class']; |
| 237 | |
| 238 | match $_, $in_pat, $out_pat for @in; |
| 239 | match $_, $out_pat, $in_pat for @out; |
| 240 | } |
| 241 | } |
| 242 | |
| 243 | |
| 244 | print "# Illegal properties\n"; |
| 245 | foreach my $p (@ILLEGAL_PROPERTIES) { |
| 246 | my $pat; |
| 247 | if ($p =~ /::/) { |
| 248 | $pat = qr /^Illegal user-defined property name/; |
| 249 | } |
| 250 | else { |
| 251 | $pat = qr /^Can't find Unicode property definition/; |
| 252 | } |
| 253 | |
| 254 | undef $@; |
| 255 | my $r = eval "'a' =~ /\\p{$p}/; 1"; |
| 256 | is($r, undef, "Unknown Unicode property \\p{$p}"); |
| 257 | like($@, $pat, "Unknown Unicode property \\p{$p}"); |
| 258 | undef $@; |
| 259 | my $s = eval "'a' =~ /\\P{$p}/; 1"; |
| 260 | is($s, undef, "Unknown Unicode property \\p{$p}"); |
| 261 | like($@, $pat, "Unknown Unicode property \\p{$p}"); |
| 262 | if (length $p == 1) { |
| 263 | undef $@; |
| 264 | my $r = eval "'a' =~ /\\p$p/; 1"; |
| 265 | is($r, undef, "Unknown Unicode property \\p$p"); |
| 266 | like($@, $pat, "Unknown Unicode property \\p$p"); |
| 267 | undef $@; |
| 268 | my $s = eval "'a' =~ /\\P$p/; 1"; |
| 269 | is($r, undef, "Unknown Unicode property \\P$p"); |
| 270 | like($@, $pat, "Unknown Unicode property \\P$p"); |
| 271 | } |
| 272 | } |
| 273 | |
| 274 | print "# User-defined properties with /i differences\n"; |
| 275 | while (my $class = shift @USER_CASELESS_PROPERTIES) { |
| 276 | my $chars_ref = shift @USER_CASELESS_PROPERTIES; |
| 277 | my @in = grep {!/^!./} @$chars_ref; |
| 278 | my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars_ref; |
| 279 | my $in_pat = eval qq ['\\p{$class}']; |
| 280 | my $out_pat = eval qq ['\\P{$class}']; |
| 281 | |
| 282 | # Verify works as regularly for not /i |
| 283 | match $_, $in_pat, $out_pat for @in; |
| 284 | match $_, $out_pat, $in_pat for @out; |
| 285 | |
| 286 | # Verify that adding /i doesn't change the in set. |
| 287 | match $_, $in_pat, $out_pat, 'i' for @in; |
| 288 | |
| 289 | # Verify that adding /i does change the out set to match. |
| 290 | match $_, $in_pat, $out_pat, 'i' for @out; |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | |
| 295 | # |
| 296 | # User defined properties |
| 297 | # |
| 298 | |
| 299 | sub InKana1 {<<'--'} |
| 300 | 3040 309F |
| 301 | 30A0 30FF |
| 302 | -- |
| 303 | |
| 304 | sub InKana2 {<<'--'} |
| 305 | +utf8::InHiragana |
| 306 | +utf8::InKatakana |
| 307 | -- |
| 308 | |
| 309 | sub InKana3 {<<'--'} |
| 310 | +utf8::InHiragana |
| 311 | +utf8::InKatakana |
| 312 | -utf8::IsCn |
| 313 | -- |
| 314 | |
| 315 | sub InNotKana {<<'--'} |
| 316 | !utf8::InHiragana |
| 317 | -utf8::InKatakana |
| 318 | +utf8::IsCn |
| 319 | -- |
| 320 | |
| 321 | sub InConsonant { |
| 322 | |
| 323 | my $return = "+utf8::Lowercase\n&utf8::ASCII\n"; |
| 324 | $return .= sprintf("-%X\n", ord "a"); |
| 325 | $return .= sprintf("-%X\n", ord "e"); |
| 326 | $return .= sprintf("-%X\n", ord "i"); |
| 327 | $return .= sprintf("-%X\n", ord "o"); |
| 328 | $return .= sprintf("-%X\n", ord "u"); |
| 329 | return $return; |
| 330 | } |
| 331 | |
| 332 | sub IsSyriac1 {<<'--'} |
| 333 | 0712 072C |
| 334 | 0730 074A |
| 335 | -- |
| 336 | |
| 337 | sub InGreekSmall {return "03B1\t03C9"} |
| 338 | sub InGreekCapital {return "0391\t03A9\n-03A2"} |
| 339 | |
| 340 | sub IsAsciiHexAndDash {<<'--'} |
| 341 | +utf8::ASCII_Hex_Digit |
| 342 | +utf8::Dash |
| 343 | -- |
| 344 | |
| 345 | sub IsMyUpper { |
| 346 | my $caseless = shift; |
| 347 | return "+utf8::" |
| 348 | . (($caseless) |
| 349 | ? 'Alphabetic' |
| 350 | : 'Uppercase') |
| 351 | . "\n&utf8::ASCII"; |
| 352 | } |
| 353 | |
| 354 | { # This has to be done here and not like the others, because we have to |
| 355 | # make sure that the property is not known until after the regex is |
| 356 | # compiled. It was previously getting confused about the pkg and /i |
| 357 | # combination |
| 358 | |
| 359 | my $mylower = qr/\p{pkg::IsMyLower}/i; |
| 360 | |
| 361 | sub pkg::IsMyLower { |
| 362 | my $caseless = shift; |
| 363 | return "+utf8::" |
| 364 | . (($caseless) |
| 365 | ? 'Alphabetic' |
| 366 | : 'Lowercase') |
| 367 | . "\n&utf8::ASCII"; |
| 368 | } |
| 369 | |
| 370 | like("A", $mylower, "Not available until runtime user-defined property with pkg:: and /i works"); |
| 371 | |
| 372 | } |
| 373 | |
| 374 | # Verify that can use user-defined properties inside another one |
| 375 | sub IsSyriac1KanaMark {<<'--'} |
| 376 | +main::IsSyriac1 |
| 377 | +main::InKana3 |
| 378 | &utf8::IsMark |
| 379 | -- |
| 380 | |
| 381 | # fake user-defined properties; these subs shouldn't be called, because |
| 382 | # their names don't start with In or Is |
| 383 | |
| 384 | sub f { die } |
| 385 | sub foo { die } |
| 386 | sub isfoo { die } |
| 387 | sub infoo { die } |
| 388 | sub ISfoo { die } |
| 389 | sub INfoo { die } |
| 390 | sub Is::foo { die } |
| 391 | sub In::foo { die } |
| 392 | __END__ |