This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
More tweakage on the Unicode character class descriptions.
[perl5.git] / t / pragma / utf8.t
CommitLineData
f96ec2a2
GS
1#!./perl
2
3BEGIN {
4 chdir 't' if -d 't';
20822f61 5 @INC = '../lib';
f96ec2a2 6 $ENV{PERL5LIB} = '../lib';
f70c35af
GS
7 if ( ord("\t") != 9 ) { # skip on ebcdic platforms
8 print "1..0 # Skip utf8 tests on ebcdic platform.\n";
9 exit;
10 }
f96ec2a2
GS
11}
12
3baa4c62 13print "1..109\n";
f96ec2a2
GS
14
15my $test = 1;
16
17sub ok {
18 my ($got,$expect) = @_;
19 print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
20 print "ok $test\n";
21}
22
7bbb0251
JH
23sub nok {
24 my ($got,$expect) = @_;
25 print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
26 print "ok $test\n";
27}
28
be341bce
GS
29sub ok_bytes {
30 use bytes;
31 my ($got,$expect) = @_;
32 print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
33 print "ok $test\n";
34}
35
7bbb0251
JH
36sub nok_bytes {
37 use bytes;
38 my ($got,$expect) = @_;
39 print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
40 print "ok $test\n";
41}
be341bce 42
f96ec2a2
GS
43{
44 use utf8;
ffc61ed2 45
f96ec2a2
GS
46 $_ = ">\x{263A}<";
47 s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg;
48 ok $_, '>&#9786;<';
c5cc3500 49 $test++; # 1
f96ec2a2
GS
50
51 $_ = ">\x{263A}<";
52 my $rx = "\x{80}-\x{10ffff}";
53 s/([$rx])/"&#".ord($1).";"/eg;
54 ok $_, '>&#9786;<';
c5cc3500 55 $test++; # 2
f96ec2a2
GS
56
57 $_ = ">\x{263A}<";
58 my $rx = "\\x{80}-\\x{10ffff}";
59 s/([$rx])/"&#".ord($1).";"/eg;
60 ok $_, '>&#9786;<';
c5cc3500 61 $test++; # 3
b8c5462f
JH
62
63 $_ = "alpha,numeric";
64 m/([[:alpha:]]+)/;
65 ok $1, 'alpha';
c5cc3500 66 $test++; # 4
b8c5462f
JH
67
68 $_ = "alphaNUMERICstring";
69 m/([[:^lower:]]+)/;
70 ok $1, 'NUMERIC';
c5cc3500 71 $test++; # 5
b8c5462f
JH
72
73 $_ = "alphaNUMERICstring";
74 m/(\p{Ll}+)/;
75 ok $1, 'alpha';
c5cc3500 76 $test++; # 6
b8c5462f
JH
77
78 $_ = "alphaNUMERICstring";
79 m/(\p{Lu}+)/;
80 ok $1, 'NUMERIC';
c5cc3500 81 $test++; # 7
b8c5462f
JH
82
83 $_ = "alpha,numeric";
84 m/([\p{IsAlpha}]+)/;
85 ok $1, 'alpha';
c5cc3500 86 $test++; # 8
b8c5462f
JH
87
88 $_ = "alphaNUMERICstring";
89 m/([^\p{IsLower}]+)/;
90 ok $1, 'NUMERIC';
c5cc3500 91 $test++; # 9
b8c5462f 92
0f4b6630
JH
93 $_ = "alpha123numeric456";
94 m/([\p{IsDigit}]+)/;
95 ok $1, '123';
c5cc3500 96 $test++; # 10
b8c5462f 97
0f4b6630
JH
98 $_ = "alpha123numeric456";
99 m/([^\p{IsDigit}]+)/;
100 ok $1, 'alpha';
c5cc3500 101 $test++; # 11
b8c5462f 102
0f4b6630
JH
103 $_ = ",123alpha,456numeric";
104 m/([\p{IsAlnum}]+)/;
105 ok $1, '123alpha';
c5cc3500 106 $test++; # 12
0f4b6630 107}
3b5dab68 108
a197cbdd 109{
ffc61ed2
JH
110 # no use utf8 needed
111 $_ = "\x{263A}\x{263A}x\x{263A}y\x{263A}";
112
113 ok length($_), 6; # 13
114 $test++;
a197cbdd 115
ffc61ed2 116 ($a) = m/x(.)/;
a197cbdd 117
ffc61ed2
JH
118 ok length($a), 1; # 14
119 $test++;
a197cbdd 120
ffc61ed2
JH
121 ok length($`), 2; # 15
122 $test++;
123 ok length($&), 2; # 16
124 $test++;
125 ok length($'), 2; # 17
126 $test++;
a197cbdd 127
ffc61ed2
JH
128 ok length($1), 1; # 18
129 $test++;
a197cbdd 130
ffc61ed2
JH
131 ok length($b=$`), 2; # 19
132 $test++;
a197cbdd 133
ffc61ed2
JH
134 ok length($b=$&), 2; # 20
135 $test++;
a197cbdd 136
ffc61ed2
JH
137 ok length($b=$'), 2; # 21
138 $test++;
a197cbdd 139
ffc61ed2
JH
140 ok length($b=$1), 1; # 22
141 $test++;
c5cc3500 142
ffc61ed2
JH
143 ok $a, "\x{263A}"; # 23
144 $test++;
a197cbdd 145
ffc61ed2
JH
146 ok $`, "\x{263A}\x{263A}"; # 24
147 $test++;
a197cbdd 148
ffc61ed2
JH
149 ok $&, "x\x{263A}"; # 25
150 $test++;
a197cbdd 151
ffc61ed2
JH
152 ok $', "y\x{263A}"; # 26
153 $test++;
a197cbdd 154
ffc61ed2
JH
155 ok $1, "\x{263A}"; # 27
156 $test++;
be341bce 157
ffc61ed2
JH
158 ok_bytes $a, "\342\230\272"; # 28
159 $test++;
be341bce 160
ffc61ed2
JH
161 ok_bytes $1, "\342\230\272"; # 29
162 $test++;
be341bce 163
ffc61ed2
JH
164 ok_bytes $&, "x\342\230\272"; # 30
165 $test++;
be341bce 166
a197cbdd 167 {
ffc61ed2
JH
168 use utf8; # required
169 $_ = chr(0x263A) . chr(0x263A) . 'x' . chr(0x263A) . 'y' . chr(0x263A);
170 }
a197cbdd 171
ffc61ed2
JH
172 ok length($_), 6; # 31
173 $test++;
a197cbdd 174
ffc61ed2 175 ($a) = m/x(.)/;
a197cbdd 176
ffc61ed2
JH
177 ok length($a), 1; # 32
178 $test++;
a197cbdd 179
ffc61ed2
JH
180 ok length($`), 2; # 33
181 $test++;
a197cbdd 182
ffc61ed2
JH
183 ok length($&), 2; # 34
184 $test++;
a197cbdd 185
ffc61ed2
JH
186 ok length($'), 2; # 35
187 $test++;
a197cbdd 188
ffc61ed2
JH
189 ok length($1), 1; # 36
190 $test++;
a197cbdd 191
ffc61ed2
JH
192 ok length($b=$`), 2; # 37
193 $test++;
a197cbdd 194
ffc61ed2
JH
195 ok length($b=$&), 2; # 38
196 $test++;
a197cbdd 197
ffc61ed2
JH
198 ok length($b=$'), 2; # 39
199 $test++;
a197cbdd 200
ffc61ed2
JH
201 ok length($b=$1), 1; # 40
202 $test++;
a197cbdd 203
ffc61ed2
JH
204 ok $a, "\x{263A}"; # 41
205 $test++;
a197cbdd 206
ffc61ed2
JH
207 ok $`, "\x{263A}\x{263A}"; # 42
208 $test++;
a197cbdd 209
ffc61ed2
JH
210 ok $&, "x\x{263A}"; # 43
211 $test++;
a197cbdd 212
ffc61ed2
JH
213 ok $', "y\x{263A}"; # 44
214 $test++;
a197cbdd 215
ffc61ed2
JH
216 ok $1, "\x{263A}"; # 45
217 $test++;
a197cbdd 218
ffc61ed2
JH
219 ok_bytes $a, "\342\230\272"; # 46
220 $test++;
a197cbdd 221
ffc61ed2
JH
222 ok_bytes $1, "\342\230\272"; # 47
223 $test++;
a197cbdd 224
ffc61ed2
JH
225 ok_bytes $&, "x\342\230\272"; # 48
226 $test++;
a197cbdd 227
ffc61ed2 228 $_ = "\342\230\272\342\230\272x\342\230\272y\342\230\272";
a197cbdd 229
ffc61ed2
JH
230 ok length($_), 14; # 49
231 $test++;
a197cbdd 232
ffc61ed2 233 ($a) = m/x(.)/;
a197cbdd 234
ffc61ed2
JH
235 ok length($a), 1; # 50
236 $test++;
a197cbdd 237
ffc61ed2
JH
238 ok length($`), 6; # 51
239 $test++;
a197cbdd 240
ffc61ed2
JH
241 ok length($&), 2; # 52
242 $test++;
3b5dab68 243
ffc61ed2
JH
244 ok length($'), 6; # 53
245 $test++;
a197cbdd 246
ffc61ed2
JH
247 ok length($1), 1; # 54
248 $test++;
a197cbdd 249
ffc61ed2
JH
250 ok length($b=$`), 6; # 55
251 $test++;
a197cbdd 252
ffc61ed2
JH
253 ok length($b=$&), 2; # 56
254 $test++;
a197cbdd 255
ffc61ed2
JH
256 ok length($b=$'), 6; # 57
257 $test++;
a197cbdd 258
ffc61ed2
JH
259 ok length($b=$1), 1; # 58
260 $test++;
a197cbdd 261
ffc61ed2
JH
262 ok $a, "\342"; # 59
263 $test++;
a197cbdd 264
ffc61ed2
JH
265 ok $`, "\342\230\272\342\230\272"; # 60
266 $test++;
a197cbdd 267
ffc61ed2
JH
268 ok $&, "x\342"; # 61
269 $test++;
a197cbdd 270
ffc61ed2
JH
271 ok $', "\230\272y\342\230\272"; # 62
272 $test++;
a197cbdd 273
ffc61ed2
JH
274 ok $1, "\342"; # 63
275 $test++;
276}
de35ba6f 277
ffc61ed2
JH
278{
279 use utf8;
de35ba6f 280 ok "\x{ab}" =~ /^\x{ab}$/, 1;
ffc61ed2 281 $test++; # 64
a197cbdd 282}
aaa68c4a
SC
283
284{
285 use utf8;
9aa983d2 286 ok_bytes chr(0x1e2), pack("C*", 0xc7, 0xa2);
ffc61ed2 287 $test++; # 65
aaa68c4a 288}
28cb3359
JH
289
290{
291 use utf8;
292 my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
293 ok "@a", "1234 123 2345";
ffc61ed2 294 $test++; # 66
28cb3359
JH
295}
296
297{
298 use utf8;
299 my $x = chr(123);
300 my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
301 ok "@a", "1234 2345";
ffc61ed2 302 $test++; # 67
28cb3359 303}
31067593 304
7bbb0251 305{
da450f52
JH
306 # bug id 20001009.001
307
89491803
SC
308 my ($a, $b);
309
310 { use bytes; $a = "\xc3\xa4" }
311 { use utf8; $b = "\xe4" } # \xXX must not produce UTF-8
312
313 print "not " if $a eq $b;
ffc61ed2 314 print "ok $test\n"; $test++; # 68
89491803
SC
315
316 { use utf8; print "not " if $a eq $b; }
ffc61ed2 317 print "ok $test\n"; $test++; # 69
7bbb0251 318}
31067593
JH
319
320{
da450f52
JH
321 # bug id 20001008.001
322
31067593
JH
323 my @x = ("stra\337e 138","stra\337e 138");
324 for (@x) {
325 s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
326 my($latin) = /^(.+)(?:\s+\d)/;
ffc61ed2 327 print $latin eq "stra\337e" ? "ok $test\n" : # 70, 71
31067593
JH
328 "#latin[$latin]\nnot ok $test\n";
329 $test++;
330 $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
331 use utf8;
332 $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
333 }
334}
b7018214
JH
335
336{
da450f52
JH
337 # bug id 20000427.003
338
339 use utf8;
340 use warnings;
341 use strict;
342
343 my $sushi = "\x{b36c}\x{5a8c}\x{ff5b}\x{5079}\x{505b}";
344
345 my @charlist = split //, $sushi;
346 my $r = '';
347 foreach my $ch (@charlist) {
348 $r = $r . " " . sprintf "U+%04X", ord($ch);
349 }
350
351 print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
ffc61ed2 352 print "ok $test\n"; # 72
da450f52
JH
353 $test++;
354}
355
356{
93f04dac
JH
357 # bug id 20000426.003
358
359 use utf8;
360
361 my $s = "\x20\x40\x{80}\x{100}\x{80}\x40\x20";
362
363 my ($a, $b, $c) = split(/\x40/, $s);
364 print "not "
365 unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
366 print "ok $test\n";
ffc61ed2 367 $test++; # 73
93f04dac
JH
368
369 my ($a, $b) = split(/\x{100}/, $s);
370 print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
371 print "ok $test\n";
ffc61ed2 372 $test++; # 74
93f04dac
JH
373
374 my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
375 print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
376 print "ok $test\n";
ffc61ed2 377 $test++; # 75
93f04dac
JH
378
379 my ($a, $b) = split(/\x40\x{80}/, $s);
380 print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
381 print "ok $test\n";
ffc61ed2 382 $test++; # 76
93f04dac
JH
383
384 my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
385 print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
386 print "ok $test\n";
ffc61ed2 387 $test++; # 77
93f04dac 388}
60ff4832
JH
389
390{
391 # bug id 20000730.004
392
393 use utf8;
394
395 my $smiley = "\x{263a}";
396
ffc61ed2
JH
397 for my $s ("\x{263a}", # 78
398 $smiley, # 79
60ff4832 399
ffc61ed2
JH
400 "" . $smiley, # 80
401 "" . "\x{263a}", # 81
60ff4832 402
ffc61ed2
JH
403 $smiley . "", # 82
404 "\x{263a}" . "", # 83
60ff4832
JH
405 ) {
406 my $length_chars = length($s);
407 my $length_bytes;
408 { use bytes; $length_bytes = length($s) }
409 my @regex_chars = $s =~ m/(.)/g;
410 my $regex_chars = @regex_chars;
411 my @split_chars = split //, $s;
412 my $split_chars = @split_chars;
413 print "not "
414 unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
415 "1/1/1/3";
416 print "ok $test\n";
417 $test++;
418 }
419
ffc61ed2
JH
420 for my $s ("\x{263a}" . "\x{263a}", # 84
421 $smiley . $smiley, # 85
60ff4832 422
ffc61ed2
JH
423 "\x{263a}\x{263a}", # 86
424 "$smiley$smiley", # 87
60ff4832 425
ffc61ed2
JH
426 "\x{263a}" x 2, # 88
427 $smiley x 2, # 89
60ff4832
JH
428 ) {
429 my $length_chars = length($s);
430 my $length_bytes;
431 { use bytes; $length_bytes = length($s) }
432 my @regex_chars = $s =~ m/(.)/g;
433 my $regex_chars = @regex_chars;
434 my @split_chars = split //, $s;
435 my $split_chars = @split_chars;
436 print "not "
437 unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
438 "2/2/2/6";
439 print "ok $test\n";
440 $test++;
441 }
442}
ffc61ed2
JH
443
444{
445 use utf8;
446
447 print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
448 print "ok $test\n";
449 $test++; # 90
450
451 print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
452 print "ok $test\n";
453 $test++; # 91
454
455 print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
456 print "ok $test\n";
457 $test++; # 92
458
459 print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
460 print "ok $test\n";
461 $test++; # 93
462
463 print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
464 print "ok $test\n";
465 $test++; # 94
466
467 print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
468 print "ok $test\n";
469 $test++; # 95
470
471 print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
472 print "ok $test\n";
473 $test++; # 96
474
475 print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
476 print "ok $test\n";
477 $test++; # 97
478}
479
480{
481 # the first half of 20001028.003
482
483 my $X = chr(1448);
484 my ($Y) = $X =~ /(.*)/;
ed7a760f 485 print "not " unless $Y eq v1448 && length($Y) == 1;
ffc61ed2
JH
486 print "ok $test\n";
487 $test++; # 98
488}
489
490{
491 # 20001108.001
492
493 use utf8;
494 my $X = "Szab\x{f3},Bal\x{e1}zs";
495 my $Y = $X;
496 $Y =~ s/(B)/$1/ for 0..3;
ed7a760f 497 print "not " unless $Y eq $X && $X eq "Szab\x{f3},Bal\x{e1}zs";
ffc61ed2
JH
498 print "ok $test\n";
499 $test++; # 99
500}
501
502{
503 # 20001114.001
504
505 use utf8;
506 use charnames ':full';
507 my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}";
ed7a760f 508 print "not " unless $text eq "\xc4" && ord($text) == 0xc4;
ffc61ed2
JH
509 print "ok $test\n";
510 $test++; # 100
511}
512
513{
514 # 20001205.014
515
516 use utf8;
517
518 my $a = "ABC\x{263A}";
519
520 my @b = split( //, $a );
521
522 print "not " unless @b == 4;
523 print "ok $test\n";
524 $test++; # 101
525
ed7a760f 526 print "not " unless length($b[3]) == 1 && $b[3] eq "\x{263A}";
ffc61ed2
JH
527 print "ok $test\n";
528 $test++; # 102
529
530 $a =~ s/^A/Z/;
ed7a760f 531 print "not " unless length($a) == 4 && $a eq "ZBC\x{263A}";
ffc61ed2
JH
532 print "ok $test\n";
533 $test++; # 103
534}
535
536{
537 # the second half of 20001028.003
538
539 use utf8;
540 $X =~ s/^/chr(1488)/e;
ed7a760f 541 print "not " unless length $X == 1 && ord($X) == 1488;
ffc61ed2
JH
542 print "ok $test\n";
543 $test++; # 104
544}
545
11c06dba
JH
546{
547 # 20000517.001
548
549 my $x = "\x{100}A";
550
551 $x =~ s/A/B/;
552
553 print "not " unless $x eq "\x{100}B" && length($x) == 2;
554 print "ok $test\n";
555 $test++; # 105
556}
699c3c34
JH
557
558{
559 use utf8;
560
561 my @a = split(/\xFE/, "\xFF\xFE\xFD");
562
563 print "not " unless @a == 2 && $a[0] eq "\xFF" && $a[1] eq "\xFD";
564 print "ok $test\n";
565 $test++; # 106
566}
f9a63242
JH
567
568{
569 use utf8;
570
571 my $w = 0;
572 local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
573 my $x = eval q/"\\/ . "\x{100}" . q/"/;;
574
575 print "not " unless $w == 0 && $x eq "\x{100}";
576 print "ok $test\n";
577 $test++; # 107
578}
579
3baa4c62
JH
580{
581 # bug id 20001230.002
582
583 use utf8;
584
585 print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c';
586 print "ok $test\n";
587 $test++; # 108
588
589 print "not " unless "École" =~ /^\C\C(c)/;
590 print "ok $test\n";
591 $test++; # 109
592}