Commit | Line | Data |
---|---|---|
e425a60b YO |
1 | #!./perl |
2 | # | |
3 | # This is a home for regular expression tests that don't fit into | |
4 | # the format supported by re/regexp.t. If you want to add a test | |
5 | # that does fit that format, add it to re/re_tests, not here. | |
6 | ||
7 | use strict; | |
8 | use warnings; | |
9 | use 5.010; | |
10 | ||
11 | ||
12 | sub run_tests; | |
13 | ||
14 | $| = 1; | |
15 | ||
e425a60b YO |
16 | |
17 | BEGIN { | |
c859a0d9 NC |
18 | if (!defined &DynaLoader::boot_DynaLoader) { |
19 | print "1..0 # Skip miniperl can't load Tie::Hash::NamedCapture, need for %+ and %-\n"; | |
20 | exit 0; | |
21 | } | |
e425a60b | 22 | chdir 't' if -d 't'; |
9d45b377 YO |
23 | @INC = ('../lib','.'); |
24 | do "re/ReTest.pl" or die $@; | |
e425a60b | 25 | } |
e425a60b | 26 | |
e425a60b | 27 | |
84193928 | 28 | plan tests => 1344; # Update this when adding/deleting tests. |
e425a60b | 29 | |
9d45b377 | 30 | run_tests() unless caller; |
e425a60b YO |
31 | |
32 | # | |
33 | # Tests start here. | |
34 | # | |
35 | sub run_tests { | |
36 | ||
e425a60b YO |
37 | SKIP: |
38 | { | |
39 | local $Message = '\C matches octet'; | |
40 | $_ = "a\x{100}b"; | |
41 | ok /(.)(\C)(\C)(.)/ or skip q [\C doesn't match], 4; | |
42 | iseq $1, "a"; | |
43 | if ($IS_ASCII) { # ASCII (or equivalent), should be UTF-8 | |
44 | iseq $2, "\xC4"; | |
45 | iseq $3, "\x80"; | |
46 | } | |
47 | elsif ($IS_EBCDIC) { # EBCDIC (or equivalent), should be UTF-EBCDIC | |
48 | iseq $2, "\x8C"; | |
49 | iseq $3, "\x41"; | |
50 | } | |
51 | else { | |
52 | SKIP: { | |
53 | ok 0, "Unexpected platform", "ord ('A') = $ordA"; | |
54 | skip "Unexpected platform"; | |
55 | } | |
56 | } | |
57 | iseq $4, "b"; | |
58 | } | |
59 | ||
60 | ||
61 | SKIP: | |
62 | { | |
63 | local $Message = '\C matches octet'; | |
64 | $_ = "\x{100}"; | |
65 | ok /(\C)/g or skip q [\C doesn't match], 2; | |
66 | if ($IS_ASCII) { | |
67 | iseq $1, "\xC4"; | |
68 | } | |
69 | elsif ($IS_EBCDIC) { | |
70 | iseq $1, "\x8C"; | |
71 | } | |
72 | else { | |
73 | ok 0, "Unexpected platform", "ord ('A') = $ordA"; | |
74 | } | |
75 | ok /(\C)/g or skip q [\C doesn't match]; | |
76 | if ($IS_ASCII) { | |
77 | iseq $1, "\x80"; | |
78 | } | |
79 | elsif ($IS_EBCDIC) { | |
80 | iseq $1, "\x41"; | |
81 | } | |
82 | else { | |
83 | ok 0, "Unexpected platform", "ord ('A') = $ordA"; | |
84 | } | |
85 | } | |
86 | ||
87 | ||
88 | { | |
89 | # Japhy -- added 03/03/2001 | |
90 | () = (my $str = "abc") =~ /(...)/; | |
91 | $str = "def"; | |
92 | iseq $1, "abc", 'Changing subject does not modify $1'; | |
93 | } | |
94 | ||
95 | ||
96 | SKIP: | |
97 | { | |
98 | # The trick is that in EBCDIC the explicit numeric range should | |
99 | # match (as also in non-EBCDIC) but the explicit alphabetic range | |
100 | # should not match. | |
101 | ok "\x8e" =~ /[\x89-\x91]/, '"\x8e" =~ /[\x89-\x91]/'; | |
102 | ok "\xce" =~ /[\xc9-\xd1]/, '"\xce" =~ /[\xc9-\xd1]/'; | |
103 | ||
104 | skip "Not an EBCDIC platform", 2 unless ord ('i') == 0x89 && | |
105 | ord ('J') == 0xd1; | |
106 | ||
107 | # In most places these tests would succeed since \x8e does not | |
108 | # in most character sets match 'i' or 'j' nor would \xce match | |
109 | # 'I' or 'J', but strictly speaking these tests are here for | |
110 | # the good of EBCDIC, so let's test these only there. | |
111 | nok "\x8e" !~ /[i-j]/, '"\x8e" !~ /[i-j]/'; | |
112 | nok "\xce" !~ /[I-J]/, '"\xce" !~ /[I-J]/'; | |
113 | } | |
114 | ||
115 | ||
116 | { | |
117 | ok "\x{ab}" =~ /\x{ab}/, '"\x{ab}" =~ /\x{ab}/ '; | |
118 | ok "\x{abcd}" =~ /\x{abcd}/, '"\x{abcd}" =~ /\x{abcd}/'; | |
119 | } | |
120 | ||
121 | ||
122 | { | |
123 | local $Message = 'bug id 20001008.001'; | |
124 | ||
125 | my @x = ("stra\337e 138", "stra\337e 138"); | |
126 | for (@x) { | |
127 | ok s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; | |
128 | ok my ($latin) = /^(.+)(?:\s+\d)/; | |
129 | iseq $latin, "stra\337e"; | |
0f289c68 | 130 | ok $latin =~ s/stra\337e/straße/; |
e425a60b YO |
131 | # |
132 | # Previous code follows, but outcommented - there were no tests. | |
133 | # | |
134 | # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a | |
135 | # use utf8; # needed for the raw UTF-8 | |
136 | # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a | |
137 | } | |
138 | } | |
139 | ||
140 | ||
141 | { | |
142 | local $Message = 'Test \x escapes'; | |
143 | ok "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; | |
144 | ok "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; | |
145 | ok "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; | |
146 | ok "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; | |
147 | ok "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; | |
148 | ok "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; | |
149 | ok "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; | |
150 | ok "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; | |
151 | } | |
152 | ||
153 | ||
e425a60b YO |
154 | SKIP: |
155 | { | |
156 | local $Message = 'Match code points > 255'; | |
157 | $_ = "abc\x{100}\x{200}\x{300}\x{380}\x{400}defg"; | |
158 | ok /(.\x{300})./ or skip "No match", 4; | |
159 | ok $` eq "abc\x{100}" && length ($`) == 4; | |
160 | ok $& eq "\x{200}\x{300}\x{380}" && length ($&) == 3; | |
161 | ok $' eq "\x{400}defg" && length ($') == 5; | |
162 | ok $1 eq "\x{200}\x{300}" && length ($1) == 2; | |
163 | } | |
164 | ||
165 | ||
e425a60b YO |
166 | |
167 | { | |
168 | my $x = "\x{10FFFD}"; | |
169 | $x =~ s/(.)/$1/g; | |
170 | ok ord($x) == 0x10FFFD && length($x) == 1, "From Robin Houston"; | |
171 | } | |
172 | ||
173 | ||
174 | { | |
175 | my %d = ( | |
176 | "7f" => [0, 0, 0], | |
177 | "80" => [1, 1, 0], | |
178 | "ff" => [1, 1, 0], | |
179 | "100" => [0, 1, 1], | |
180 | ); | |
a2d248f9 | 181 | |
e425a60b YO |
182 | while (my ($code, $match) = each %d) { |
183 | local $Message = "Properties of \\x$code"; | |
184 | my $char = eval qq ["\\x{$code}"]; | |
a2d248f9 NC |
185 | |
186 | iseq(0 + ($char =~ /[\x80-\xff]/), $$match[0]); | |
187 | iseq(0 + ($char =~ /[\x80-\x{100}]/), $$match[1]); | |
188 | iseq(0 + ($char =~ /[\x{100}]/), $$match[2]); | |
e425a60b YO |
189 | } |
190 | } | |
191 | ||
192 | ||
193 | { | |
194 | # From Japhy | |
195 | local $Message; | |
196 | must_warn 'qr/(?c)/', '^Useless \(\?c\)'; | |
197 | must_warn 'qr/(?-c)/', '^Useless \(\?-c\)'; | |
198 | must_warn 'qr/(?g)/', '^Useless \(\?g\)'; | |
199 | must_warn 'qr/(?-g)/', '^Useless \(\?-g\)'; | |
200 | must_warn 'qr/(?o)/', '^Useless \(\?o\)'; | |
201 | must_warn 'qr/(?-o)/', '^Useless \(\?-o\)'; | |
202 | ||
203 | # Now test multi-error regexes | |
204 | must_warn 'qr/(?g-o)/', '^Useless \(\?g\).*\nUseless \(\?-o\)'; | |
205 | must_warn 'qr/(?g-c)/', '^Useless \(\?g\).*\nUseless \(\?-c\)'; | |
206 | # (?c) means (?g) error won't be thrown | |
207 | must_warn 'qr/(?o-cg)/', '^Useless \(\?o\).*\nUseless \(\?-c\)'; | |
208 | must_warn 'qr/(?ogc)/', '^Useless \(\?o\).*\nUseless \(\?g\).*\n' . | |
209 | 'Useless \(\?c\)'; | |
210 | } | |
211 | ||
212 | ||
213 | { | |
214 | local $Message = "/x tests"; | |
215 | $_ = "foo"; | |
216 | eval_ok <<" --"; | |
217 | /f | |
218 | o\r | |
219 | o | |
220 | \$ | |
221 | /x | |
222 | -- | |
223 | eval_ok <<" --"; | |
224 | /f | |
225 | o | |
226 | o | |
227 | \$\r | |
228 | /x | |
229 | -- | |
230 | } | |
231 | ||
232 | ||
233 | { | |
234 | local $Message = "/o feature"; | |
235 | sub test_o {$_ [0] =~ /$_[1]/o; return $1} | |
236 | iseq test_o ('abc', '(.)..'), 'a'; | |
237 | iseq test_o ('abc', '..(.)'), 'a'; | |
238 | } | |
239 | ||
e425a60b YO |
240 | { |
241 | # Test basic $^N usage outside of a regex | |
242 | local $Message = '$^N usage outside of a regex'; | |
243 | my $x = "abcdef"; | |
244 | ok ($x =~ /cde/ and !defined $^N); | |
245 | ok ($x =~ /(cde)/ and $^N eq "cde"); | |
246 | ok ($x =~ /(c)(d)(e)/ and $^N eq "e"); | |
247 | ok ($x =~ /(c(d)e)/ and $^N eq "cde"); | |
248 | ok ($x =~ /(foo)|(c(d)e)/ and $^N eq "cde"); | |
249 | ok ($x =~ /(c(d)e)|(foo)/ and $^N eq "cde"); | |
250 | ok ($x =~ /(c(d)e)|(abc)/ and $^N eq "abc"); | |
251 | ok ($x =~ /(c(d)e)|(abc)x/ and $^N eq "cde"); | |
252 | ok ($x =~ /(c(d)e)(abc)?/ and $^N eq "cde"); | |
253 | ok ($x =~ /(?:c(d)e)/ and $^N eq "d"); | |
254 | ok ($x =~ /(?:c(d)e)(?:f)/ and $^N eq "d"); | |
255 | ok ($x =~ /(?:([abc])|([def]))*/ and $^N eq "f"); | |
256 | ok ($x =~ /(?:([ace])|([bdf]))*/ and $^N eq "f"); | |
257 | ok ($x =~ /(([ace])|([bd]))*/ and $^N eq "e"); | |
258 | {ok ($x =~ /(([ace])|([bdf]))*/ and $^N eq "f");} | |
259 | ## Test to see if $^N is automatically localized -- it should now | |
260 | ## have the value set in the previous test. | |
261 | iseq $^N, "e", '$^N is automatically localized'; | |
262 | ||
263 | # Now test inside (?{ ... }) | |
264 | local $Message = '$^N usage inside (?{ ... })'; | |
265 | our ($y, $z); | |
266 | ok ($x =~ /a([abc])(?{$y=$^N})c/ and $y eq "b"); | |
267 | ok ($x =~ /a([abc]+)(?{$y=$^N})d/ and $y eq "bc"); | |
268 | ok ($x =~ /a([abcdefg]+)(?{$y=$^N})d/ and $y eq "bc"); | |
269 | ok ($x =~ /(a([abcdefg]+)(?{$y=$^N})d)(?{$z=$^N})e/ and $y eq "bc" | |
270 | and $z eq "abcd"); | |
271 | ok ($x =~ /(a([abcdefg]+)(?{$y=$^N})de)(?{$z=$^N})/ and $y eq "bc" | |
272 | and $z eq "abcde"); | |
273 | ||
274 | } | |
275 | ||
276 | ||
277 | SKIP: | |
278 | { | |
279 | ## Should probably put in tests for all the POSIX stuff, | |
280 | ## but not sure how to guarantee a specific locale...... | |
281 | ||
282 | skip "Not an ASCII platform", 2 unless $IS_ASCII; | |
283 | local $Message = 'Test [[:cntrl:]]'; | |
284 | my $AllBytes = join "" => map {chr} 0 .. 255; | |
285 | (my $x = $AllBytes) =~ s/[[:cntrl:]]//g; | |
286 | iseq $x, join "", map {chr} 0x20 .. 0x7E, 0x80 .. 0xFF; | |
287 | ||
288 | ($x = $AllBytes) =~ s/[^[:cntrl:]]//g; | |
289 | iseq $x, join "", map {chr} 0x00 .. 0x1F, 0x7F; | |
290 | } | |
291 | ||
292 | ||
293 | { | |
294 | # With /s modifier UTF8 chars were interpreted as bytes | |
295 | local $Message = "UTF-8 chars aren't bytes"; | |
296 | my $a = "Hello \x{263A} World"; | |
297 | my @a = ($a =~ /./gs); | |
298 | iseq $#a, 12; | |
299 | } | |
300 | ||
301 | ||
302 | { | |
303 | local $Message = '. matches \n with /s'; | |
304 | my $str1 = "foo\nbar"; | |
305 | my $str2 = "foo\n\x{100}bar"; | |
306 | my ($a, $b) = map {chr} $IS_ASCII ? (0xc4, 0x80) : (0x8c, 0x41); | |
307 | my @a; | |
308 | @a = $str1 =~ /./g; iseq @a, 6; iseq "@a", "f o o b a r"; | |
309 | @a = $str1 =~ /./gs; iseq @a, 7; iseq "@a", "f o o \n b a r"; | |
310 | @a = $str1 =~ /\C/g; iseq @a, 7; iseq "@a", "f o o \n b a r"; | |
311 | @a = $str1 =~ /\C/gs; iseq @a, 7; iseq "@a", "f o o \n b a r"; | |
312 | @a = $str2 =~ /./g; iseq @a, 7; iseq "@a", "f o o \x{100} b a r"; | |
313 | @a = $str2 =~ /./gs; iseq @a, 8; iseq "@a", "f o o \n \x{100} b a r"; | |
314 | @a = $str2 =~ /\C/g; iseq @a, 9; iseq "@a", "f o o \n $a $b b a r"; | |
315 | @a = $str2 =~ /\C/gs; iseq @a, 9; iseq "@a", "f o o \n $a $b b a r"; | |
316 | } | |
317 | ||
318 | ||
319 | { | |
e425a60b YO |
320 | no warnings 'digit'; |
321 | # Check that \x## works. 5.6.1 and 5.005_03 fail some of these. | |
322 | my $x; | |
323 | $x = "\x4e" . "E"; | |
324 | ok ($x =~ /^\x4EE$/, "Check only 2 bytes of hex are matched."); | |
325 | ||
326 | $x = "\x4e" . "i"; | |
327 | ok ($x =~ /^\x4Ei$/, "Check that invalid hex digit stops it (2)"); | |
328 | ||
329 | $x = "\x4" . "j"; | |
330 | ok ($x =~ /^\x4j$/, "Check that invalid hex digit stops it (1)"); | |
331 | ||
332 | $x = "\x0" . "k"; | |
333 | ok ($x =~ /^\xk$/, "Check that invalid hex digit stops it (0)"); | |
334 | ||
335 | $x = "\x0" . "x"; | |
336 | ok ($x =~ /^\xx$/, "\\xx isn't to be treated as \\0"); | |
337 | ||
338 | $x = "\x0" . "xa"; | |
339 | ok ($x =~ /^\xxa$/, "\\xxa isn't to be treated as \\xa"); | |
340 | ||
341 | $x = "\x9" . "_b"; | |
342 | ok ($x =~ /^\x9_b$/, "\\x9_b isn't to be treated as \\x9b"); | |
343 | ||
344 | # and now again in [] ranges | |
345 | ||
346 | $x = "\x4e" . "E"; | |
347 | ok ($x =~ /^[\x4EE]{2}$/, "Check only 2 bytes of hex are matched."); | |
348 | ||
349 | $x = "\x4e" . "i"; | |
350 | ok ($x =~ /^[\x4Ei]{2}$/, "Check that invalid hex digit stops it (2)"); | |
351 | ||
352 | $x = "\x4" . "j"; | |
353 | ok ($x =~ /^[\x4j]{2}$/, "Check that invalid hex digit stops it (1)"); | |
354 | ||
355 | $x = "\x0" . "k"; | |
356 | ok ($x =~ /^[\xk]{2}$/, "Check that invalid hex digit stops it (0)"); | |
357 | ||
358 | $x = "\x0" . "x"; | |
359 | ok ($x =~ /^[\xx]{2}$/, "\\xx isn't to be treated as \\0"); | |
360 | ||
361 | $x = "\x0" . "xa"; | |
362 | ok ($x =~ /^[\xxa]{3}$/, "\\xxa isn't to be treated as \\xa"); | |
363 | ||
364 | $x = "\x9" . "_b"; | |
365 | ok ($x =~ /^[\x9_b]{3}$/, "\\x9_b isn't to be treated as \\x9b"); | |
366 | ||
367 | # Check that \x{##} works. 5.6.1 fails quite a few of these. | |
368 | ||
369 | $x = "\x9b"; | |
370 | ok ($x =~ /^\x{9_b}$/, "\\x{9_b} is to be treated as \\x9b"); | |
371 | ||
372 | $x = "\x9b" . "y"; | |
373 | ok ($x =~ /^\x{9_b}y$/, "\\x{9_b} is to be treated as \\x9b (again)"); | |
374 | ||
375 | $x = "\x9b" . "y"; | |
376 | ok ($x =~ /^\x{9b_}y$/, "\\x{9b_} is to be treated as \\x9b"); | |
377 | ||
378 | $x = "\x9b" . "y"; | |
379 | ok ($x =~ /^\x{9_bq}y$/, "\\x{9_bc} is to be treated as \\x9b"); | |
380 | ||
381 | $x = "\x0" . "y"; | |
382 | ok ($x =~ /^\x{x9b}y$/, "\\x{x9b} is to be treated as \\x0"); | |
383 | ||
384 | $x = "\x0" . "y"; | |
385 | ok ($x =~ /^\x{0x9b}y$/, "\\x{0x9b} is to be treated as \\x0"); | |
386 | ||
387 | $x = "\x9b" . "y"; | |
388 | ok ($x =~ /^\x{09b}y$/, "\\x{09b} is to be treated as \\x9b"); | |
389 | ||
390 | $x = "\x9b"; | |
391 | ok ($x =~ /^[\x{9_b}]$/, "\\x{9_b} is to be treated as \\x9b"); | |
392 | ||
393 | $x = "\x9b" . "y"; | |
394 | ok ($x =~ /^[\x{9_b}y]{2}$/, | |
395 | "\\x{9_b} is to be treated as \\x9b (again)"); | |
396 | ||
397 | $x = "\x9b" . "y"; | |
398 | ok ($x =~ /^[\x{9b_}y]{2}$/, "\\x{9b_} is to be treated as \\x9b"); | |
399 | ||
400 | $x = "\x9b" . "y"; | |
401 | ok ($x =~ /^[\x{9_bq}y]{2}$/, "\\x{9_bc} is to be treated as \\x9b"); | |
402 | ||
403 | $x = "\x0" . "y"; | |
404 | ok ($x =~ /^[\x{x9b}y]{2}$/, "\\x{x9b} is to be treated as \\x0"); | |
405 | ||
406 | $x = "\x0" . "y"; | |
407 | ok ($x =~ /^[\x{0x9b}y]{2}$/, "\\x{0x9b} is to be treated as \\x0"); | |
408 | ||
409 | $x = "\x9b" . "y"; | |
410 | ok ($x =~ /^[\x{09b}y]{2}$/, "\\x{09b} is to be treated as \\x9b"); | |
411 | ||
412 | } | |
413 | ||
414 | ||
415 | { | |
416 | # High bit bug -- japhy | |
417 | my $x = "ab\200d"; | |
418 | ok $x =~ /.*?\200/, "High bit fine"; | |
419 | } | |
420 | ||
421 | ||
422 | { | |
423 | # The basic character classes and Unicode | |
424 | ok "\x{0100}" =~ /\w/, 'LATIN CAPITAL LETTER A WITH MACRON in /\w/'; | |
425 | ok "\x{0660}" =~ /\d/, 'ARABIC-INDIC DIGIT ZERO in /\d/'; | |
426 | ok "\x{1680}" =~ /\s/, 'OGHAM SPACE MARK in /\s/'; | |
427 | } | |
428 | ||
429 | ||
430 | { | |
431 | local $Message = "Folding matches and Unicode"; | |
432 | ok "a\x{100}" =~ /A/i; | |
433 | ok "A\x{100}" =~ /a/i; | |
434 | ok "a\x{100}" =~ /a/i; | |
435 | ok "A\x{100}" =~ /A/i; | |
436 | ok "\x{101}a" =~ /\x{100}/i; | |
437 | ok "\x{100}a" =~ /\x{100}/i; | |
438 | ok "\x{101}a" =~ /\x{101}/i; | |
439 | ok "\x{100}a" =~ /\x{101}/i; | |
440 | ok "a\x{100}" =~ /A\x{100}/i; | |
441 | ok "A\x{100}" =~ /a\x{100}/i; | |
442 | ok "a\x{100}" =~ /a\x{100}/i; | |
443 | ok "A\x{100}" =~ /A\x{100}/i; | |
444 | ok "a\x{100}" =~ /[A]/i; | |
445 | ok "A\x{100}" =~ /[a]/i; | |
446 | ok "a\x{100}" =~ /[a]/i; | |
447 | ok "A\x{100}" =~ /[A]/i; | |
448 | ok "\x{101}a" =~ /[\x{100}]/i; | |
449 | ok "\x{100}a" =~ /[\x{100}]/i; | |
450 | ok "\x{101}a" =~ /[\x{101}]/i; | |
451 | ok "\x{100}a" =~ /[\x{101}]/i; | |
452 | } | |
453 | ||
454 | ||
455 | { | |
456 | use charnames ':full'; | |
457 | local $Message = "Folding 'LATIN LETTER A WITH GRAVE'"; | |
458 | ||
459 | my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}"; | |
460 | my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}"; | |
0f289c68 | 461 | |
e425a60b YO |
462 | ok $lower =~ m/$UPPER/i; |
463 | ok $UPPER =~ m/$lower/i; | |
464 | ok $lower =~ m/[$UPPER]/i; | |
465 | ok $UPPER =~ m/[$lower]/i; | |
466 | ||
467 | local $Message = "Folding 'GREEK LETTER ALPHA WITH VRACHY'"; | |
468 | ||
469 | $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}"; | |
470 | $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}"; | |
471 | ||
472 | ok $lower =~ m/$UPPER/i; | |
473 | ok $UPPER =~ m/$lower/i; | |
474 | ok $lower =~ m/[$UPPER]/i; | |
475 | ok $UPPER =~ m/[$lower]/i; | |
476 | ||
477 | local $Message = "Folding 'LATIN LETTER Y WITH DIAERESIS'"; | |
478 | ||
479 | $lower = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; | |
480 | $UPPER = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"; | |
481 | ||
482 | ok $lower =~ m/$UPPER/i; | |
483 | ok $UPPER =~ m/$lower/i; | |
484 | ok $lower =~ m/[$UPPER]/i; | |
485 | ok $UPPER =~ m/[$lower]/i; | |
486 | } | |
487 | ||
488 | ||
489 | { | |
490 | use charnames ':full'; | |
e425a60b YO |
491 | local $Message = "GREEK CAPITAL LETTER SIGMA vs " . |
492 | "COMBINING GREEK PERISPOMENI"; | |
493 | ||
494 | my $SIGMA = "\N{GREEK CAPITAL LETTER SIGMA}"; | |
495 | my $char = "\N{COMBINING GREEK PERISPOMENI}"; | |
496 | ||
1c6f1211 NC |
497 | may_not_warn sub {ok "_:$char:_" !~ m/_:$SIGMA:_/i}, |
498 | 'Did not warn [change a5961de5f4215b5c]'; | |
e425a60b YO |
499 | } |
500 | ||
501 | ||
502 | { | |
503 | local $Message = '\X'; | |
504 | use charnames ':full'; | |
505 | ||
506 | ok "a!" =~ /^(\X)!/ && $1 eq "a"; | |
507 | ok "\xDF!" =~ /^(\X)!/ && $1 eq "\xDF"; | |
508 | ok "\x{100}!" =~ /^(\X)!/ && $1 eq "\x{100}"; | |
509 | ok "\x{100}\x{300}!" =~ /^(\X)!/ && $1 eq "\x{100}\x{300}"; | |
510 | ok "\N{LATIN CAPITAL LETTER E}!" =~ /^(\X)!/ && | |
511 | $1 eq "\N{LATIN CAPITAL LETTER E}"; | |
512 | ok "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}!" | |
513 | =~ /^(\X)!/ && | |
514 | $1 eq "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}"; | |
515 | ||
516 | local $Message = '\C and \X'; | |
517 | ok "!abc!" =~ /a\Cc/; | |
518 | ok "!abc!" =~ /a\Xc/; | |
519 | } | |
520 | ||
521 | ||
522 | { | |
523 | local $Message = "Final Sigma"; | |
524 | ||
525 | my $SIGMA = "\x{03A3}"; # CAPITAL | |
526 | my $Sigma = "\x{03C2}"; # SMALL FINAL | |
527 | my $sigma = "\x{03C3}"; # SMALL | |
528 | ||
529 | ok $SIGMA =~ /$SIGMA/i; | |
530 | ok $SIGMA =~ /$Sigma/i; | |
531 | ok $SIGMA =~ /$sigma/i; | |
532 | ||
533 | ok $Sigma =~ /$SIGMA/i; | |
534 | ok $Sigma =~ /$Sigma/i; | |
535 | ok $Sigma =~ /$sigma/i; | |
536 | ||
537 | ok $sigma =~ /$SIGMA/i; | |
538 | ok $sigma =~ /$Sigma/i; | |
539 | ok $sigma =~ /$sigma/i; | |
0f289c68 | 540 | |
e425a60b YO |
541 | ok $SIGMA =~ /[$SIGMA]/i; |
542 | ok $SIGMA =~ /[$Sigma]/i; | |
543 | ok $SIGMA =~ /[$sigma]/i; | |
544 | ||
545 | ok $Sigma =~ /[$SIGMA]/i; | |
546 | ok $Sigma =~ /[$Sigma]/i; | |
547 | ok $Sigma =~ /[$sigma]/i; | |
548 | ||
549 | ok $sigma =~ /[$SIGMA]/i; | |
550 | ok $sigma =~ /[$Sigma]/i; | |
551 | ok $sigma =~ /[$sigma]/i; | |
552 | ||
553 | local $Message = "More final Sigma"; | |
554 | ||
555 | my $S3 = "$SIGMA$Sigma$sigma"; | |
556 | ||
557 | ok ":$S3:" =~ /:(($SIGMA)+):/i && $1 eq $S3 && $2 eq $sigma; | |
558 | ok ":$S3:" =~ /:(($Sigma)+):/i && $1 eq $S3 && $2 eq $sigma; | |
559 | ok ":$S3:" =~ /:(($sigma)+):/i && $1 eq $S3 && $2 eq $sigma; | |
560 | ||
561 | ok ":$S3:" =~ /:(([$SIGMA])+):/i && $1 eq $S3 && $2 eq $sigma; | |
562 | ok ":$S3:" =~ /:(([$Sigma])+):/i && $1 eq $S3 && $2 eq $sigma; | |
563 | ok ":$S3:" =~ /:(([$sigma])+):/i && $1 eq $S3 && $2 eq $sigma; | |
564 | } | |
565 | ||
566 | ||
567 | { | |
568 | use charnames ':full'; | |
569 | local $Message = "Parlez-Vous " . | |
570 | "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais?"; | |
571 | ||
572 | ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran.ais/ && | |
573 | $& eq "Francais"; | |
574 | ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran.ais/ && | |
575 | $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; | |
576 | ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Cais/ && | |
577 | $& eq "Francais"; | |
578 | # COMBINING CEDILLA is two bytes when encoded | |
579 | ok "Franc\N{COMBINING CEDILLA}ais" =~ /Franc\C\Cais/; | |
580 | ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Xais/ && | |
581 | $& eq "Francais"; | |
582 | ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran\Xais/ && | |
583 | $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; | |
584 | ok "Franc\N{COMBINING CEDILLA}ais" =~ /Fran\Xais/ && | |
585 | $& eq "Franc\N{COMBINING CEDILLA}ais"; | |
586 | ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ | |
587 | /Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais/ && | |
588 | $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; | |
589 | ok "Franc\N{COMBINING CEDILLA}ais" =~ /Franc\N{COMBINING CEDILLA}ais/ && | |
590 | $& eq "Franc\N{COMBINING CEDILLA}ais"; | |
591 | ||
592 | my @f = ( | |
593 | ["Fran\N{LATIN SMALL LETTER C}ais", "Francais"], | |
594 | ["Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", | |
595 | "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"], | |
596 | ["Franc\N{COMBINING CEDILLA}ais", "Franc\N{COMBINING CEDILLA}ais"], | |
597 | ); | |
598 | foreach my $entry (@f) { | |
599 | my ($subject, $match) = @$entry; | |
600 | ok $subject =~ /Fran(?:c\N{COMBINING CEDILLA}?| | |
601 | \N{LATIN SMALL LETTER C WITH CEDILLA})ais/x && | |
602 | $& eq $match; | |
603 | } | |
604 | } | |
605 | ||
606 | ||
607 | { | |
608 | local $Message = "Lingering (and useless) UTF8 flag doesn't mess up /i"; | |
609 | my $pat = "ABcde"; | |
610 | my $str = "abcDE\x{100}"; | |
611 | chop $str; | |
612 | ok $str =~ /$pat/i; | |
613 | ||
614 | $pat = "ABcde\x{100}"; | |
615 | $str = "abcDE"; | |
616 | chop $pat; | |
617 | ok $str =~ /$pat/i; | |
618 | ||
619 | $pat = "ABcde\x{100}"; | |
620 | $str = "abcDE\x{100}"; | |
621 | chop $pat; | |
622 | chop $str; | |
623 | ok $str =~ /$pat/i; | |
624 | } | |
625 | ||
626 | ||
627 | { | |
628 | use charnames ':full'; | |
629 | local $Message = "LATIN SMALL LETTER SHARP S " . | |
630 | "(\N{LATIN SMALL LETTER SHARP S})"; | |
631 | ||
632 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ | |
633 | /\N{LATIN SMALL LETTER SHARP S}/; | |
634 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ | |
635 | /\N{LATIN SMALL LETTER SHARP S}/i; | |
636 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ | |
637 | /[\N{LATIN SMALL LETTER SHARP S}]/; | |
638 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ | |
639 | /[\N{LATIN SMALL LETTER SHARP S}]/i; | |
640 | ||
641 | ok "ss" =~ /\N{LATIN SMALL LETTER SHARP S}/i; | |
642 | ok "SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i; | |
643 | ok "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i; | |
644 | ok "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i; | |
645 | ||
646 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i; | |
647 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i; | |
0f289c68 | 648 | |
e425a60b YO |
649 | local $Message = "Unoptimized named sequence in class"; |
650 | ok "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i; | |
651 | ok "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i; | |
652 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ | |
653 | /[\N{LATIN SMALL LETTER SHARP S}x]/; | |
654 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ | |
655 | /[\N{LATIN SMALL LETTER SHARP S}x]/i; | |
656 | } | |
657 | ||
658 | ||
659 | { | |
660 | # More whitespace: U+0085, U+2028, U+2029\n"; | |
661 | ||
662 | # U+0085, U+00A0 need to be forced to be Unicode, the \x{100} does that. | |
663 | SKIP: { | |
664 | skip "EBCDIC platform", 4 if $IS_EBCDIC; | |
665 | # Do \x{0015} and \x{0041} match \s in EBCDIC? | |
666 | ok "<\x{100}\x{0085}>" =~ /<\x{100}\s>/, '\x{0085} in \s'; | |
667 | ok "<\x{0085}>" =~ /<\v>/, '\x{0085} in \v'; | |
668 | ok "<\x{100}\x{00A0}>" =~ /<\x{100}\s>/, '\x{00A0} in \s'; | |
669 | ok "<\x{00A0}>" =~ /<\h>/, '\x{00A0} in \h'; | |
670 | } | |
671 | my @h = map {sprintf "%05x" => $_} 0x01680, 0x0180E, 0x02000 .. 0x0200A, | |
672 | 0x0202F, 0x0205F, 0x03000; | |
673 | my @v = map {sprintf "%05x" => $_} 0x02028, 0x02029; | |
674 | ||
675 | my @H = map {sprintf "%05x" => $_} 0x01361, 0x0200B, 0x02408, 0x02420, | |
676 | 0x0303F, 0xE0020; | |
677 | my @V = map {sprintf "%05x" => $_} 0x0008A .. 0x0008D, 0x00348, 0x10100, | |
678 | 0xE005F, 0xE007C; | |
679 | ||
680 | for my $hex (@h) { | |
681 | my $str = eval qq ["<\\x{$hex}>"]; | |
682 | ok $str =~ /<\s>/, "\\x{$hex} in \\s"; | |
683 | ok $str =~ /<\h>/, "\\x{$hex} in \\h"; | |
684 | ok $str !~ /<\v>/, "\\x{$hex} not in \\v"; | |
685 | } | |
686 | ||
687 | for my $hex (@v) { | |
688 | my $str = eval qq ["<\\x{$hex}>"]; | |
689 | ok $str =~ /<\s>/, "\\x{$hex} in \\s"; | |
690 | ok $str =~ /<\v>/, "\\x{$hex} in \\v"; | |
691 | ok $str !~ /<\h>/, "\\x{$hex} not in \\h"; | |
692 | } | |
693 | ||
694 | for my $hex (@H) { | |
695 | my $str = eval qq ["<\\x{$hex}>"]; | |
696 | ok $str =~ /<\S>/, "\\x{$hex} in \\S"; | |
697 | ok $str =~ /<\H>/, "\\x{$hex} in \\H"; | |
698 | } | |
699 | ||
700 | for my $hex (@V) { | |
701 | my $str = eval qq ["<\\x{$hex}>"]; | |
702 | ok $str =~ /<\S>/, "\\x{$hex} in \\S"; | |
703 | ok $str =~ /<\V>/, "\\x{$hex} in \\V"; | |
704 | } | |
705 | } | |
706 | ||
707 | ||
708 | { | |
709 | # . with /s should work on characters, as opposed to bytes | |
710 | local $Message = ". with /s works on characters, not bytes"; | |
711 | ||
712 | my $s = "\x{e4}\x{100}"; | |
713 | # This is not expected to match: the point is that | |
714 | # neither should we get "Malformed UTF-8" warnings. | |
715 | may_not_warn sub {$s =~ /\G(.+?)\n/gcs}, "No 'Malformed UTF-8' warning"; | |
716 | ||
717 | my @c; | |
718 | push @c => $1 while $s =~ /\G(.)/gs; | |
719 | ||
720 | local $" = ""; | |
721 | iseq "@c", $s; | |
722 | ||
723 | # Test only chars < 256 | |
724 | my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; | |
725 | my $r1 = ""; | |
726 | while ($t1 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { | |
0f289c68 | 727 | $r1 .= $1 . $2; |
e425a60b YO |
728 | } |
729 | ||
730 | my $t2 = $t1 . "\x{100}"; # Repeat with a larger char | |
731 | my $r2 = ""; | |
732 | while ($t2 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { | |
0f289c68 | 733 | $r2 .= $1 . $2; |
e425a60b YO |
734 | } |
735 | $r2 =~ s/\x{100}//; | |
736 | ||
737 | iseq $r1, $r2; | |
738 | } | |
739 | ||
740 | ||
741 | { | |
742 | local $Message = "Unicode lookbehind"; | |
743 | ok "A\x{100}B" =~ /(?<=A.)B/; | |
744 | ok "A\x{200}\x{300}B" =~ /(?<=A..)B/; | |
745 | ok "\x{400}AB" =~ /(?<=\x{400}.)B/; | |
746 | ok "\x{500}\x{600}B" =~ /(?<=\x{500}.)B/; | |
747 | ||
748 | # Original code also contained: | |
749 | # ok "\x{500\x{600}}B" =~ /(?<=\x{500}.)B/; | |
750 | # but that looks like a typo. | |
751 | } | |
752 | ||
753 | ||
754 | { | |
755 | local $Message = 'UTF-8 hash keys and /$/'; | |
756 | # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters | |
757 | # /2002-01/msg01327.html | |
758 | ||
759 | my $u = "a\x{100}"; | |
760 | my $v = substr ($u, 0, 1); | |
761 | my $w = substr ($u, 1, 1); | |
762 | my %u = ($u => $u, $v => $v, $w => $w); | |
763 | for (keys %u) { | |
764 | my $m1 = /^\w*$/ ? 1 : 0; | |
765 | my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0; | |
766 | iseq $m1, $m2; | |
767 | } | |
768 | } | |
769 | ||
770 | ||
771 | { | |
e425a60b YO |
772 | local $Message = "No SEGV in s/// and UTF-8"; |
773 | my $s = "s#\x{100}" x 4; | |
774 | ok $s =~ s/[^\w]/ /g; | |
2e84be61 | 775 | if ( 1 or $ENV{PERL_TEST_LEGACY_POSIX_CC} ) { |
e425a60b YO |
776 | iseq $s, "s \x{100}" x 4; |
777 | } | |
778 | else { | |
779 | iseq $s, "s " x 4; | |
780 | } | |
781 | } | |
782 | ||
783 | ||
784 | { | |
785 | local $Message = "UTF-8 bug (maybe already known?)"; | |
786 | my $u = "foo"; | |
787 | $u =~ s/./\x{100}/g; | |
788 | iseq $u, "\x{100}\x{100}\x{100}"; | |
789 | ||
790 | $u = "foobar"; | |
791 | $u =~ s/[ao]/\x{100}/g; | |
792 | iseq $u, "f\x{100}\x{100}b\x{100}r"; | |
793 | ||
794 | $u =~ s/\x{100}/e/g; | |
795 | iseq $u, "feeber"; | |
796 | } | |
797 | ||
798 | ||
799 | { | |
800 | local $Message = "UTF-8 bug with s///"; | |
801 | # check utf8/non-utf8 mixtures | |
802 | # try to force all float/anchored check combinations | |
803 | ||
804 | my $c = "\x{100}"; | |
805 | my $subst; | |
806 | for my $re ("xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", | |
807 | "xx.*(?=$c)", "(?=$c).*xx",) { | |
808 | ok "xxx" !~ /$re/; | |
809 | ok +($subst = "xxx") !~ s/$re//; | |
810 | } | |
811 | for my $re ("xx.*$c*", "$c*.*xx") { | |
812 | ok "xxx" =~ /$re/; | |
813 | ok +($subst = "xxx") =~ s/$re//; | |
814 | iseq $subst, ""; | |
815 | } | |
816 | for my $re ("xxy*", "y*xx") { | |
817 | ok "xx$c" =~ /$re/; | |
818 | ok +($subst = "xx$c") =~ s/$re//; | |
819 | iseq $subst, $c; | |
820 | ok "xy$c" !~ /$re/; | |
821 | ok +($subst = "xy$c") !~ s/$re//; | |
822 | } | |
823 | for my $re ("xy$c*z", "x$c*yz") { | |
824 | ok "xyz" =~ /$re/; | |
825 | ok +($subst = "xyz") =~ s/$re//; | |
826 | iseq $subst, ""; | |
827 | } | |
828 | } | |
829 | ||
830 | ||
831 | { | |
832 | local $Message = "qr /.../x"; | |
833 | my $R = qr / A B C # D E/x; | |
834 | ok "ABCDE" =~ $R && $& eq "ABC"; | |
835 | ok "ABCDE" =~ /$R/ && $& eq "ABC"; | |
836 | ok "ABCDE" =~ m/$R/ && $& eq "ABC"; | |
837 | ok "ABCDE" =~ /($R)/ && $1 eq "ABC"; | |
838 | ok "ABCDE" =~ m/($R)/ && $1 eq "ABC"; | |
839 | } | |
840 | ||
841 | ||
e425a60b | 842 | |
0f289c68 | 843 | |
e425a60b YO |
844 | { |
845 | local $\; | |
846 | $_ = 'aaaaaaaaaa'; | |
847 | utf8::upgrade($_); chop $_; $\="\n"; | |
848 | ok /[^\s]+/, 'm/[^\s]/ utf8'; | |
849 | ok /[^\d]+/, 'm/[^\d]/ utf8'; | |
850 | ok +($a = $_, $_ =~ s/[^\s]+/./g), 's/[^\s]/ utf8'; | |
851 | ok +($a = $_, $a =~ s/[^\d]+/./g), 's/[^\s]/ utf8'; | |
852 | } | |
853 | ||
854 | ||
e425a60b YO |
855 | |
856 | ||
857 | { | |
858 | # Subject: Odd regexp behavior | |
859 | # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk> | |
860 | # Date: Wed, 26 Feb 2003 16:53:12 +0000 | |
861 | # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk> | |
862 | # To: perl-unicode@perl.org | |
863 | ||
864 | local $Message = 'Markus Kuhn 2003-02-26'; | |
0f289c68 | 865 | |
e425a60b YO |
866 | my $x = "\x{2019}\nk"; |
867 | ok $x =~ s/(\S)\n(\S)/$1 $2/sg; | |
868 | ok $x eq "\x{2019} k"; | |
869 | ||
870 | $x = "b\nk"; | |
871 | ok $x =~ s/(\S)\n(\S)/$1 $2/sg; | |
872 | ok $x eq "b k"; | |
873 | ||
874 | ok "\x{2019}" =~ /\S/; | |
875 | } | |
876 | ||
877 | ||
878 | { | |
e425a60b YO |
879 | # XXX DAPM 13-Apr-06. Recursive split is still broken. It's only luck it |
880 | # hasn't been crashing. Disable this test until it is fixed properly. | |
881 | # XXX also check what it returns rather than just doing ok(1,...) | |
882 | # split /(?{ split "" })/, "abc"; | |
883 | local $TODO = "Recursive split is still broken"; | |
884 | ok 0, 'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0'; | |
885 | } | |
886 | ||
887 | ||
888 | { | |
889 | ok "\x{100}\n" =~ /\x{100}\n$/, "UTF-8 length cache and fbm_compile"; | |
890 | } | |
891 | ||
892 | ||
893 | { | |
894 | package Str; | |
895 | use overload q /""/ => sub {${$_ [0]};}; | |
896 | sub new {my ($c, $v) = @_; bless \$v, $c;} | |
897 | ||
898 | package main; | |
899 | $_ = Str -> new ("a\x{100}/\x{100}b"); | |
900 | ok join (":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr"; | |
901 | } | |
902 | ||
e425a60b YO |
903 | { |
904 | my $re = qq /^([^X]*)X/; | |
905 | utf8::upgrade ($re); | |
906 | ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; | |
0ab1ff8e KW |
907 | my $loc_re = qq /(?l:^([^X]*)X)/; |
908 | utf8::upgrade ($loc_re); | |
909 | ok "\x{100}X" =~ /$loc_re/, "locale, S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; | |
e425a60b YO |
910 | } |
911 | ||
e425a60b YO |
912 | { |
913 | ok "123\x{100}" =~ /^.*1.*23\x{100}$/, | |
914 | 'UTF-8 + multiple floating substr'; | |
915 | } | |
916 | ||
e425a60b YO |
917 | { |
918 | local $Message = '<20030808193656.5109.1@llama.ni-s.u-net.com>'; | |
919 | ||
920 | # LATIN SMALL/CAPITAL LETTER A WITH MACRON | |
921 | ok " \x{101}" =~ qr/\x{100}/i; | |
922 | ||
923 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW | |
924 | ok " \x{1E01}" =~ qr/\x{1E00}/i; | |
925 | ||
926 | # DESERET SMALL/CAPITAL LETTER LONG I | |
927 | ok " \x{10428}" =~ qr/\x{10400}/i; | |
928 | ||
929 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' | |
930 | ok " \x{1E01}x" =~ qr/\x{1E00}X/i; | |
931 | } | |
932 | ||
e425a60b YO |
933 | { |
934 | for (120 .. 130) { | |
935 | my $head = 'x' x $_; | |
936 | local $Message = q [Don't misparse \x{...} in regexp ] . | |
937 | q [near 127 char EXACT limit]; | |
938 | for my $tail ('\x{0061}', '\x{1234}', '\x61') { | |
939 | eval_ok qq ["$head$tail" =~ /$head$tail/]; | |
940 | } | |
941 | local $Message = q [Don't misparse \N{...} in regexp ] . | |
942 | q [near 127 char EXACT limit]; | |
943 | for my $tail ('\N{SNOWFLAKE}') { | |
944 | eval_ok qq [use charnames ':full'; | |
945 | "$head$tail" =~ /$head$tail/]; | |
946 | } | |
947 | } | |
948 | } | |
949 | ||
e425a60b YO |
950 | { # TRIE related |
951 | our @got = (); | |
952 | "words" =~ /(word|word|word)(?{push @got, $1})s$/; | |
93f09d7b | 953 | iseq @got, 1, "TRIE optimisation"; |
e425a60b YO |
954 | |
955 | @got = (); | |
956 | "words" =~ /(word|word|word)(?{push @got,$1})s$/i; | |
957 | iseq @got, 1,"TRIEF optimisation"; | |
958 | ||
959 | my @nums = map {int rand 1000} 1 .. 100; | |
960 | my $re = "(" . (join "|", @nums) . ")"; | |
961 | $re = qr/\b$re\b/; | |
962 | ||
963 | foreach (@nums) { | |
964 | ok $_ =~ /$re/, "Trie nums"; | |
965 | } | |
966 | ||
967 | $_ = join " ", @nums; | |
968 | @got = (); | |
969 | push @got, $1 while /$re/g; | |
970 | ||
971 | my %count; | |
972 | $count {$_} ++ for @got; | |
973 | my $ok = 1; | |
974 | for (@nums) { | |
975 | $ok = 0 if --$count {$_} < 0; | |
976 | } | |
977 | ok $ok, "Trie min count matches"; | |
978 | } | |
979 | ||
980 | ||
981 | { | |
982 | # TRIE related | |
983 | # LATIN SMALL/CAPITAL LETTER A WITH MACRON | |
984 | ok "foba \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i && | |
985 | $1 eq "\x{101}foo", | |
986 | "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON"; | |
987 | ||
988 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW | |
989 | ok "foba \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i && | |
990 | $1 eq "\x{1E01}foo", | |
991 | "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW"; | |
992 | ||
993 | # DESERET SMALL/CAPITAL LETTER LONG I | |
994 | ok "foba \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i && | |
995 | $1 eq "\x{10428}foo", | |
996 | "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I"; | |
997 | ||
998 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' | |
999 | ok "foba \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i && | |
1000 | $1 eq "\x{1E01}xfoo", | |
1001 | "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'"; | |
1002 | ||
1003 | use charnames ':full'; | |
1004 | ||
1005 | my $s = "\N{LATIN SMALL LETTER SHARP S}"; | |
1006 | ok "foba ba$s" =~ qr/(foo|Ba$s|bar)/i && $1 eq "ba$s", | |
1007 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; | |
1008 | ok "foba ba$s" =~ qr/(Ba$s|foo|bar)/i && $1 eq "ba$s", | |
1009 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; | |
1010 | ok "foba ba$s" =~ qr/(foo|bar|Ba$s)/i && $1 eq "ba$s", | |
1011 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; | |
1012 | ||
1013 | ok "foba ba$s" =~ qr/(foo|Bass|bar)/i && $1 eq "ba$s", | |
1014 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; | |
1015 | ||
1016 | ok "foba ba$s" =~ qr/(foo|BaSS|bar)/i && $1 eq "ba$s", | |
1017 | "TRIEF + LATIN SMALL LETTER SHARP S =~ SS"; | |
1018 | ||
1019 | ok "foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i | |
1020 | && $1 eq "ba${s}pxySS$s$s", | |
1021 | "COMMON PREFIX TRIEF + LATIN SMALL LETTER SHARP S"; | |
1022 | } | |
1023 | ||
1024 | ||
e425a60b YO |
1025 | |
1026 | ||
1027 | { | |
0f289c68 YO |
1028 | BEGIN { |
1029 | unshift @INC, 'lib'; | |
1030 | } | |
e425a60b | 1031 | use Cname; |
0f289c68 | 1032 | |
e425a60b | 1033 | ok 'fooB' =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname"; |
e425a60b YO |
1034 | # |
1035 | # Why doesn't must_warn work here? | |
1036 | # | |
1037 | my $w; | |
1038 | local $SIG {__WARN__} = sub {$w .= "@_"}; | |
1039 | eval 'q(xxWxx) =~ /[\N{WARN}]/'; | |
ff3f963a KW |
1040 | ok $w && $w =~ /Using just the first character returned by \\N{} in character class/, |
1041 | "single character in [\\N{}] warning"; | |
e425a60b YO |
1042 | |
1043 | undef $w; | |
1044 | eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/, | |
1045 | "Zerolength charname in charclass doesn't match \\0"]; | |
ff3f963a KW |
1046 | ok $w && $w =~ /Ignoring zero length/, |
1047 | 'Ignoring zero length \N{} in character class warning'; | |
e425a60b YO |
1048 | |
1049 | ok 'AB' =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1'; | |
1050 | ok 'ABC' =~ /(\N{EVIL})/, 'Charname caching $1'; | |
1051 | ok 'xy' =~ /x\N{EMPTY-STR}y/, | |
1052 | 'Empty string charname produces NOTHING node'; | |
1053 | ok '' =~ /\N{EMPTY-STR}/, | |
1054 | 'Empty string charname produces NOTHING node'; | |
ff3f963a KW |
1055 | ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works'; |
1056 | ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works'; | |
1057 | ||
1058 | # If remove the limitation in regcomp code these should work | |
1059 | # differently | |
1060 | undef $w; | |
1061 | eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string fails gracefully']; | |
1062 | ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit'; | |
1063 | undef $w; | |
1064 | eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string fails gracefully']; | |
1065 | ok $w && $w =~ /Using just the first characters returned/, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit'; | |
1066 | undef $w; | |
1067 | eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string doesnt work']; | |
1068 | ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit'; | |
1069 | undef $w; | |
1070 | eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string doesnt work']; | |
1071 | ok $w && $w =~ /Using just the first characters returned/i, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit'; | |
1072 | undef $w; | |
1073 | eval 'q(syntax error) =~ /\N{MALFORMED}/'; | |
1074 | ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error'; | |
cb233ae3 KW |
1075 | undef $w; |
1076 | eval 'q() =~ /\N{4F}/'; | |
1077 | ok $w && $w =~ /Deprecated/, 'Verify that leading digit in name gives warning'; | |
1078 | undef $w; | |
1079 | eval 'q() =~ /\N{COM,MA}/'; | |
1080 | ok $w && $w =~ /Deprecated/, 'Verify that comma in name gives warning'; | |
1081 | undef $w; | |
1082 | my $name = "A\x{D7}O"; | |
1083 | eval "q(W) =~ /\\N{$name}/"; | |
1084 | ok $w && $w =~ /Deprecated/, 'Verify that latin1 symbol in name gives warning'; | |
1085 | undef $w; | |
1086 | $name = "A\x{D1}O"; | |
1087 | eval "q(W) =~ /\\N{$name}/"; | |
1088 | ok ! $w, 'Verify that latin1 letter in name doesnt give warning'; | |
0f289c68 | 1089 | |
e425a60b YO |
1090 | } |
1091 | ||
1092 | ||
1093 | { | |
1094 | use charnames ':full'; | |
1095 | ||
1096 | ok 'aabc' !~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against aabc'; | |
1097 | ok 'a+bc' =~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against a+bc'; | |
1098 | ||
1099 | ok ' A B' =~ /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, | |
1100 | 'Intermixed named and unicode escapes'; | |
1101 | ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ | |
1102 | /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, | |
1103 | 'Intermixed named and unicode escapes'; | |
1104 | ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ | |
1105 | /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/, | |
0f289c68 | 1106 | 'Intermixed named and unicode escapes'; |
ff3f963a | 1107 | ok "\0" =~ /^\N{NULL}$/, 'Verify that \N{NULL} works; is not confused with an error'; |
e425a60b YO |
1108 | } |
1109 | ||
1110 | ||
1111 | { | |
1112 | our $brackets; | |
1113 | $brackets = qr{ | |
1114 | { (?> [^{}]+ | (??{ $brackets }) )* } | |
1115 | }x; | |
1116 | ||
1117 | ok "{b{c}d" !~ m/^((??{ $brackets }))/, "Bracket mismatch"; | |
1118 | ||
1119 | SKIP: { | |
1120 | our @stack = (); | |
1121 | my @expect = qw( | |
1122 | stuff1 | |
1123 | stuff2 | |
1124 | <stuff1>and<stuff2> | |
1125 | right | |
1126 | <right> | |
1127 | <<right>> | |
1128 | <<<right>>> | |
1129 | <<stuff1>and<stuff2>><<<<right>>>> | |
1130 | ); | |
1131 | ||
1132 | local $_ = '<<<stuff1>and<stuff2>><<<<right>>>>>'; | |
1133 | ok /^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/, | |
1134 | "Recursion matches"; | |
1135 | iseq @stack, @expect, "Right amount of matches" | |
1136 | or skip "Won't test individual results as count isn't equal", | |
1137 | 0 + @expect; | |
1138 | my $idx = 0; | |
1139 | foreach my $expect (@expect) { | |
1140 | iseq $stack [$idx], $expect, | |
1141 | "Expecting '$expect' at stack pos #$idx"; | |
1142 | $idx ++; | |
1143 | } | |
1144 | } | |
1145 | } | |
1146 | ||
1147 | ||
1148 | { | |
1149 | my $s = '123453456'; | |
1150 | $s =~ s/(?<digits>\d+)\k<digits>/$+{digits}/; | |
1151 | ok $s eq '123456', 'Named capture (angle brackets) s///'; | |
1152 | $s = '123453456'; | |
1153 | $s =~ s/(?'digits'\d+)\k'digits'/$+{digits}/; | |
0f289c68 | 1154 | ok $s eq '123456', 'Named capture (single quotes) s///'; |
e425a60b YO |
1155 | } |
1156 | ||
1157 | ||
1158 | { | |
1159 | my @ary = ( | |
1160 | pack('U', 0x00F1), # n-tilde | |
1161 | '_'.pack('U', 0x00F1), # _ + n-tilde | |
1162 | 'c'.pack('U', 0x0327), # c + cedilla | |
1163 | pack('U*', 0x00F1, 0x0327), # n-tilde + cedilla | |
e425a60b YO |
1164 | pack('U', 0x0391), # ALPHA |
1165 | pack('U', 0x0391).'2', # ALPHA + 2 | |
1166 | pack('U', 0x0391).'_', # ALPHA + _ | |
1167 | ); | |
1168 | ||
1169 | for my $uni (@ary) { | |
1170 | my ($r1, $c1, $r2, $c2) = eval qq { | |
1171 | use utf8; | |
1172 | scalar ("..foo foo.." =~ /(?'${uni}'foo) \\k'${uni}'/), | |
1173 | \$+{${uni}}, | |
1174 | scalar ("..bar bar.." =~ /(?<${uni}>bar) \\k<${uni}>/), | |
1175 | \$+{${uni}}; | |
1176 | }; | |
1177 | ok $r1, "Named capture UTF (?'')"; | |
1178 | ok defined $c1 && $c1 eq 'foo', "Named capture UTF \%+"; | |
1179 | ok $r2, "Named capture UTF (?<>)"; | |
1180 | ok defined $c2 && $c2 eq 'bar', "Named capture UTF \%+"; | |
1181 | } | |
1182 | } | |
1183 | ||
e425a60b YO |
1184 | { |
1185 | my $s = 'foo bar baz'; | |
1186 | my @res; | |
1187 | if ('1234' =~ /(?<A>1)(?<B>2)(?<A>3)(?<B>4)/) { | |
1188 | foreach my $name (sort keys(%-)) { | |
1189 | my $ary = $- {$name}; | |
1190 | foreach my $idx (0 .. $#$ary) { | |
1191 | push @res, "$name:$idx:$ary->[$idx]"; | |
1192 | } | |
1193 | } | |
1194 | } | |
1195 | my @expect = qw (A:0:1 A:1:3 B:0:2 B:1:4); | |
1196 | iseq "@res", "@expect", "Check %-"; | |
1197 | eval' | |
1198 | no warnings "uninitialized"; | |
1199 | print for $- {this_key_doesnt_exist}; | |
1200 | '; | |
1201 | ok !$@,'lvalue $- {...} should not throw an exception'; | |
1202 | } | |
1203 | ||
e425a60b YO |
1204 | { |
1205 | # \, breaks {3,4} | |
1206 | ok "xaaay" !~ /xa{3\,4}y/, '\, in a pattern'; | |
1207 | ok "xa{3,4}y" =~ /xa{3\,4}y/, '\, in a pattern'; | |
1208 | ||
1209 | # \c\ followed by _ | |
1210 | ok "x\c_y" !~ /x\c\_y/, '\_ in a pattern'; | |
1211 | ok "x\c\_y" =~ /x\c\_y/, '\_ in a pattern'; | |
1212 | ||
1213 | # \c\ followed by other characters | |
1214 | for my $c ("z", "\0", "!", chr(254), chr(256)) { | |
1215 | my $targ = "a\034$c"; | |
1216 | my $reg = "a\\c\\$c"; | |
1217 | ok eval ("qq/$targ/ =~ /$reg/"), "\\c\\ in pattern"; | |
1218 | } | |
1219 | } | |
1220 | ||
e425a60b YO |
1221 | { # Test the (*PRUNE) pattern |
1222 | our $count = 0; | |
1223 | 'aaab' =~ /a+b?(?{$count++})(*FAIL)/; | |
1224 | iseq $count, 9, "Expect 9 for no (*PRUNE)"; | |
1225 | $count = 0; | |
1226 | 'aaab' =~ /a+b?(*PRUNE)(?{$count++})(*FAIL)/; | |
1227 | iseq $count, 3, "Expect 3 with (*PRUNE)"; | |
1228 | local $_ = 'aaab'; | |
1229 | $count = 0; | |
1230 | 1 while /.(*PRUNE)(?{$count++})(*FAIL)/g; | |
1231 | iseq $count, 4, "/.(*PRUNE)/"; | |
1232 | $count = 0; | |
1233 | 'aaab' =~ /a+b?(??{'(*PRUNE)'})(?{$count++})(*FAIL)/; | |
1234 | iseq $count, 3, "Expect 3 with (*PRUNE)"; | |
1235 | local $_ = 'aaab'; | |
1236 | $count = 0; | |
1237 | 1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g; | |
1238 | iseq $count, 4, "/.(*PRUNE)/"; | |
1239 | } | |
1240 | ||
1241 | ||
1242 | { # Test the (*SKIP) pattern | |
1243 | our $count = 0; | |
1244 | 'aaab' =~ /a+b?(*SKIP)(?{$count++})(*FAIL)/; | |
1245 | iseq $count, 1, "Expect 1 with (*SKIP)"; | |
1246 | local $_ = 'aaab'; | |
1247 | $count = 0; | |
1248 | 1 while /.(*SKIP)(?{$count++})(*FAIL)/g; | |
1249 | iseq $count, 4, "/.(*SKIP)/"; | |
1250 | $_ = 'aaabaaab'; | |
1251 | $count = 0; | |
1252 | our @res = (); | |
1253 | 1 while /(a+b?)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; | |
1254 | iseq $count, 2, "Expect 2 with (*SKIP)"; | |
1255 | iseq "@res", "aaab aaab", "Adjacent (*SKIP) works as expected"; | |
1256 | } | |
1257 | ||
1258 | ||
1259 | { # Test the (*SKIP) pattern | |
1260 | our $count = 0; | |
1261 | 'aaab' =~ /a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/; | |
1262 | iseq $count, 1, "Expect 1 with (*SKIP)"; | |
1263 | local $_ = 'aaab'; | |
1264 | $count = 0; | |
1265 | 1 while /.(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/g; | |
1266 | iseq $count, 4, "/.(*SKIP)/"; | |
1267 | $_ = 'aaabaaab'; | |
1268 | $count = 0; | |
1269 | our @res = (); | |
1270 | 1 while /(a+b?)(*MARK:foo)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; | |
1271 | iseq $count, 2, "Expect 2 with (*SKIP)"; | |
1272 | iseq "@res", "aaab aaab", "Adjacent (*SKIP) works as expected"; | |
1273 | } | |
1274 | ||
1275 | ||
1276 | { # Test the (*SKIP) pattern | |
1277 | our $count = 0; | |
1278 | 'aaab' =~ /a*(*MARK:a)b?(*MARK:b)(*SKIP:a)(?{$count++})(*FAIL)/; | |
1279 | iseq $count, 3, "Expect 3 with *MARK:a)b?(*MARK:b)(*SKIP:a)"; | |
1280 | local $_ = 'aaabaaab'; | |
1281 | $count = 0; | |
1282 | our @res = (); | |
1283 | 1 while | |
1284 | /(a*(*MARK:a)b?)(*MARK:x)(*SKIP:a)(?{$count++; push @res,$1})(*FAIL)/g; | |
1285 | iseq $count, 5, "Expect 5 with (*MARK:a)b?)(*MARK:x)(*SKIP:a)"; | |
1286 | iseq "@res", "aaab b aaab b ", | |
1287 | "Adjacent (*MARK:a)b?)(*MARK:x)(*SKIP:a) works as expected"; | |
1288 | } | |
1289 | ||
1290 | ||
1291 | { # Test the (*COMMIT) pattern | |
1292 | our $count = 0; | |
1293 | 'aaabaaab' =~ /a+b?(*COMMIT)(?{$count++})(*FAIL)/; | |
1294 | iseq $count, 1, "Expect 1 with (*COMMIT)"; | |
1295 | local $_ = 'aaab'; | |
1296 | $count = 0; | |
1297 | 1 while /.(*COMMIT)(?{$count++})(*FAIL)/g; | |
1298 | iseq $count, 1, "/.(*COMMIT)/"; | |
1299 | $_ = 'aaabaaab'; | |
1300 | $count = 0; | |
1301 | our @res = (); | |
1302 | 1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g; | |
1303 | iseq $count, 1, "Expect 1 with (*COMMIT)"; | |
1304 | iseq "@res", "aaab", "Adjacent (*COMMIT) works as expected"; | |
1305 | } | |
1306 | ||
1307 | ||
1308 | { | |
1309 | # Test named commits and the $REGERROR var | |
1310 | our $REGERROR; | |
1311 | for my $name ('', ':foo') { | |
1312 | for my $pat ("(*PRUNE$name)", | |
1313 | ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", | |
0f289c68 | 1314 | "(*COMMIT$name)") { |
e425a60b YO |
1315 | for my $suffix ('(*FAIL)', '') { |
1316 | 'aaaab' =~ /a+b$pat$suffix/; | |
1317 | iseq $REGERROR, | |
1318 | ($suffix ? ($name ? 'foo' : "1") : ""), | |
1319 | "Test $pat and \$REGERROR $suffix"; | |
1320 | } | |
1321 | } | |
1322 | } | |
1323 | } | |
1324 | ||
1325 | ||
1326 | { | |
1327 | # Test named commits and the $REGERROR var | |
1328 | package Fnorble; | |
1329 | our $REGERROR; | |
1330 | for my $name ('', ':foo') { | |
1331 | for my $pat ("(*PRUNE$name)", | |
1332 | ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", | |
0f289c68 | 1333 | "(*COMMIT$name)") { |
e425a60b YO |
1334 | for my $suffix ('(*FAIL)','') { |
1335 | 'aaaab' =~ /a+b$pat$suffix/; | |
1336 | ::iseq $REGERROR, | |
1337 | ($suffix ? ($name ? 'foo' : "1") : ""), | |
1338 | "Test $pat and \$REGERROR $suffix"; | |
1339 | } | |
1340 | } | |
0f289c68 YO |
1341 | } |
1342 | } | |
e425a60b YO |
1343 | |
1344 | ||
1345 | { | |
1346 | # Test named commits and the $REGERROR var | |
1347 | local $Message = '$REGERROR'; | |
1348 | our $REGERROR; | |
1349 | for my $word (qw (bar baz bop)) { | |
1350 | $REGERROR = ""; | |
1351 | "aaaaa$word" =~ | |
1352 | /a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/; | |
1353 | iseq $REGERROR, $word; | |
0f289c68 | 1354 | } |
e425a60b YO |
1355 | } |
1356 | ||
e425a60b YO |
1357 | { |
1358 | #Mindnumbingly simple test of (*THEN) | |
1359 | for ("ABC","BAX") { | |
1360 | ok /A (*THEN) X | B (*THEN) C/x, "Simple (*THEN) test"; | |
1361 | } | |
1362 | } | |
1363 | ||
1364 | ||
1365 | { | |
1366 | local $Message = "Relative Recursion"; | |
1367 | my $parens = qr/(\((?:[^()]++|(?-1))*+\))/; | |
1368 | local $_ = 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; | |
1369 | my ($all, $one, $two) = ('', '', ''); | |
9d45b377 | 1370 | ok m/foo $parens \s* \+ \s* bar $parens/x; |
e425a60b YO |
1371 | iseq $1, '((2*3)+4-3)'; |
1372 | iseq $2, '(2*(3+4)-1*(2-3))'; | |
1373 | iseq $&, 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; | |
1374 | iseq $&, $_; | |
1375 | } | |
1376 | ||
1377 | { | |
1378 | my $spaces=" "; | |
1379 | local $_ = join 'bar', $spaces, $spaces; | |
1380 | our $count = 0; | |
1381 | s/(?>\s+bar)(?{$count++})//g; | |
1382 | iseq $_, $spaces, "SUSPEND final string"; | |
1383 | iseq $count, 1, "Optimiser should have prevented more than one match"; | |
1384 | } | |
1385 | ||
e425a60b YO |
1386 | |
1387 | { | |
1388 | # From Message-ID: <877ixs6oa6.fsf@k75.linux.bogus> | |
1389 | my $dow_name = "nada"; | |
1390 | my $parser = "(\$dow_name) = \$time_string =~ /(D\x{e9}\\ " . | |
1391 | "C\x{e9}adaoin|D\x{e9}\\ Sathairn|\\w+|\x{100})/"; | |
1392 | my $time_string = "D\x{e9} C\x{e9}adaoin"; | |
1393 | eval $parser; | |
1394 | ok !$@, "Test Eval worked"; | |
1395 | iseq $dow_name, $time_string, "UTF-8 trie common prefix extraction"; | |
1396 | } | |
1397 | ||
1398 | ||
1399 | { | |
1400 | my $v; | |
1401 | ($v = 'bar') =~ /(\w+)/g; | |
1402 | $v = 'foo'; | |
1403 | iseq "$1", 'bar', '$1 is safe after /g - may fail due ' . | |
1404 | 'to specialized config in pp_hot.c' | |
1405 | } | |
1406 | ||
1407 | ||
1408 | { | |
1409 | local $Message = "http://nntp.perl.org/group/perl.perl5.porters/118663"; | |
1410 | my $qr_barR1 = qr/(bar)\g-1/; | |
1411 | ok "foobarbarxyz" =~ $qr_barR1; | |
1412 | ok "foobarbarxyz" =~ qr/foo${qr_barR1}xyz/; | |
1413 | ok "foobarbarxyz" =~ qr/(foo)${qr_barR1}xyz/; | |
1414 | ok "foobarbarxyz" =~ qr/(foo)(bar)\g{-1}xyz/; | |
1415 | ok "foobarbarxyz" =~ qr/(foo${qr_barR1})xyz/; | |
1416 | ok "foobarbarxyz" =~ qr/(foo(bar)\g{-1})xyz/; | |
0f289c68 | 1417 | } |
e425a60b | 1418 | |
e425a60b YO |
1419 | { |
1420 | local $Message = '$REGMARK'; | |
1421 | our @r = (); | |
1422 | our ($REGMARK, $REGERROR); | |
1423 | ok 'foofoo' =~ /foo (*MARK:foo) (?{push @r,$REGMARK}) /x; | |
0f289c68 | 1424 | iseq "@r","foo"; |
e425a60b YO |
1425 | iseq $REGMARK, "foo"; |
1426 | ok 'foofoo' !~ /foo (*MARK:foo) (*FAIL) /x; | |
1427 | ok !$REGMARK; | |
1428 | iseq $REGERROR, 'foo'; | |
1429 | } | |
1430 | ||
1431 | ||
1432 | { | |
1433 | local $Message = '\K test'; | |
1434 | my $x; | |
1435 | $x = "abc.def.ghi.jkl"; | |
1436 | $x =~ s/.*\K\..*//; | |
1437 | iseq $x, "abc.def.ghi"; | |
0f289c68 | 1438 | |
e425a60b YO |
1439 | $x = "one two three four"; |
1440 | $x =~ s/o+ \Kthree//g; | |
1441 | iseq $x, "one two four"; | |
0f289c68 | 1442 | |
e425a60b YO |
1443 | $x = "abcde"; |
1444 | $x =~ s/(.)\K/$1/g; | |
1445 | iseq $x, "aabbccddee"; | |
1446 | } | |
1447 | ||
1448 | ||
1449 | { | |
1450 | sub kt { | |
1451 | return '4' if $_[0] eq '09028623'; | |
1452 | } | |
1453 | # Nested EVAL using PL_curpm (via $1 or friends) | |
1454 | my $re; | |
1455 | our $grabit = qr/ ([0-6][0-9]{7}) (??{ kt $1 }) [890] /x; | |
1456 | $re = qr/^ ( (??{ $grabit }) ) $ /x; | |
1457 | my @res = '0902862349' =~ $re; | |
1458 | iseq join ("-", @res), "0902862349", | |
1459 | 'PL_curpm is set properly on nested eval'; | |
1460 | ||
1461 | our $qr = qr/ (o) (??{ $1 }) /x; | |
1462 | ok 'boob'=~/( b (??{ $qr }) b )/x && 1, "PL_curpm, nested eval"; | |
1463 | } | |
1464 | ||
1465 | ||
1466 | { | |
1467 | use charnames ":full"; | |
1468 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "I =~ Alphabetic"; | |
1469 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Uppercase}/, "I =~ Uppercase"; | |
1470 | ok "\N{ROMAN NUMERAL ONE}" !~ /\p{Lowercase}/, "I !~ Lowercase"; | |
1471 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "I =~ ID_Start"; | |
1472 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "I =~ ID_Continue"; | |
1473 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "i =~ Alphabetic"; | |
1474 | ok "\N{SMALL ROMAN NUMERAL ONE}" !~ /\p{Uppercase}/, "i !~ Uppercase"; | |
0ab1ff8e KW |
1475 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Uppercase}/i, "i =~ Uppercase under /i"; |
1476 | ok "\N{SMALL ROMAN NUMERAL ONE}" !~ /\p{Titlecase}/, "i !~ Titlecase"; | |
1477 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Titlecase}/i, "i =~ Titlecase under /i"; | |
1478 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Lowercase}/i, "I =~ Lowercase under | |
1479 | /i"; | |
e425a60b YO |
1480 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Lowercase}/, "i =~ Lowercase"; |
1481 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "i =~ ID_Start"; | |
1482 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "i =~ ID_Continue" | |
1483 | } | |
1484 | ||
0ab1ff8e KW |
1485 | { # More checking that /i works on the few properties that it makes a |
1486 | # difference. Uppercase, Lowercase, and Titlecase were done in the | |
1487 | # block above | |
1488 | ok "A" =~ /\p{PosixUpper}/, "A =~ PosixUpper"; | |
1489 | ok "A" =~ /\p{PosixUpper}/i, "A =~ PosixUpper under /i"; | |
1490 | ok "A" !~ /\p{PosixLower}/, "A !~ PosixLower"; | |
1491 | ok "A" =~ /\p{PosixLower}/i, "A =~ PosixLower under /i"; | |
1492 | ok "a" !~ /\p{PosixUpper}/, "a !~ PosixUpper"; | |
1493 | ok "a" =~ /\p{PosixUpper}/i, "a =~ PosixUpper under /i"; | |
1494 | ok "a" =~ /\p{PosixLower}/, "a =~ PosixLower"; | |
1495 | ok "a" =~ /\p{PosixLower}/i, "a =~ PosixLower under /i"; | |
1496 | ||
1497 | ok "\xC0" =~ /\p{XPosixUpper}/, "\\xC0 =~ XPosixUpper"; | |
1498 | ok "\xC0" =~ /\p{XPosixUpper}/i, "\\xC0 =~ XPosixUpper under /i"; | |
1499 | ok "\xC0" !~ /\p{XPosixLower}/, "\\xC0 !~ XPosixLower"; | |
1500 | ok "\xC0" =~ /\p{XPosixLower}/i, "\\xC0 =~ XPosixLower under /i"; | |
1501 | ok "\xE0" !~ /\p{XPosixUpper}/, "\\xE0 !~ XPosixUpper"; | |
1502 | ok "\xE0" =~ /\p{XPosixUpper}/i, "\\xE0 =~ XPosixUpper under /i"; | |
1503 | ok "\xE0" =~ /\p{XPosixLower}/, "\\xE0 =~ XPosixLower"; | |
1504 | ok "\xE0" =~ /\p{XPosixLower}/i, "\\xE0 =~ XPosixLower under /i"; | |
1505 | ||
1506 | ok "\xC0" =~ /\p{UppercaseLetter}/, "\\xC0 =~ UppercaseLetter"; | |
1507 | ok "\xC0" =~ /\p{UppercaseLetter}/i, "\\xC0 =~ UppercaseLetter under /i"; | |
1508 | ok "\xC0" !~ /\p{LowercaseLetter}/, "\\xC0 !~ LowercaseLetter"; | |
1509 | ok "\xC0" =~ /\p{LowercaseLetter}/i, "\\xC0 =~ LowercaseLetter under /i"; | |
1510 | ok "\xC0" !~ /\p{TitlecaseLetter}/, "\\xC0 !~ TitlecaseLetter"; | |
1511 | ok "\xC0" =~ /\p{TitlecaseLetter}/i, "\\xC0 =~ TitlecaseLetter under /i"; | |
1512 | ok "\xE0" !~ /\p{UppercaseLetter}/, "\\xE0 !~ UppercaseLetter"; | |
1513 | ok "\xE0" =~ /\p{UppercaseLetter}/i, "\\xE0 =~ UppercaseLetter under /i"; | |
1514 | ok "\xE0" =~ /\p{LowercaseLetter}/, "\\xE0 =~ LowercaseLetter"; | |
1515 | ok "\xE0" =~ /\p{LowercaseLetter}/i, "\\xE0 =~ LowercaseLetter under /i"; | |
1516 | ok "\xE0" !~ /\p{TitlecaseLetter}/, "\\xE0 !~ TitlecaseLetter"; | |
1517 | ok "\xE0" =~ /\p{TitlecaseLetter}/i, "\\xE0 =~ TitlecaseLetter under /i"; | |
1518 | ok "\x{1C5}" !~ /\p{UppercaseLetter}/, "\\x{1C5} !~ UppercaseLetter"; | |
1519 | ok "\x{1C5}" =~ /\p{UppercaseLetter}/i, "\\x{1C5} =~ UppercaseLetter under /i"; | |
1520 | ok "\x{1C5}" !~ /\p{LowercaseLetter}/, "\\x{1C5} !~ LowercaseLetter"; | |
1521 | ok "\x{1C5}" =~ /\p{LowercaseLetter}/i, "\\x{1C5} =~ LowercaseLetter under /i"; | |
1522 | ok "\x{1C5}" =~ /\p{TitlecaseLetter}/, "\\x{1C5} =~ TitlecaseLetter"; | |
1523 | ok "\x{1C5}" =~ /\p{TitlecaseLetter}/i, "\\x{1C5} =~ TitlecaseLetter under /i"; | |
1524 | } | |
1525 | ||
e425a60b YO |
1526 | |
1527 | { | |
1528 | # requirement of Unicode Technical Standard #18, 1.7 Code Points | |
1529 | # cf. http://www.unicode.org/reports/tr18/#Supplementary_Characters | |
1530 | for my $u (0x7FF, 0x800, 0xFFFF, 0x10000) { | |
1531 | no warnings 'utf8'; # oops | |
1532 | my $c = chr $u; | |
1533 | my $x = sprintf '%04X', $u; | |
1534 | ok "A${c}B" =~ /A[\0-\x{10000}]B/, "Unicode range - $x"; | |
1535 | } | |
1536 | } | |
1537 | ||
1538 | ||
1539 | { | |
1540 | my $res=""; | |
1541 | ||
1542 | if ('1' =~ /(?|(?<digit>1)|(?<digit>2))/) { | |
1543 | $res = "@{$- {digit}}"; | |
1544 | } | |
1545 | iseq $res, "1", | |
1546 | "Check that (?|...) doesnt cause dupe entries in the names array"; | |
0f289c68 | 1547 | |
e425a60b YO |
1548 | $res = ""; |
1549 | if ('11' =~ /(?|(?<digit>1)|(?<digit>2))(?&digit)/) { | |
1550 | $res = "@{$- {digit}}"; | |
1551 | } | |
1552 | iseq $res, "1", "Check that (?&..) to a buffer inside " . | |
1553 | "a (?|...) goes to the leftmost"; | |
1554 | } | |
1555 | ||
1556 | ||
1557 | { | |
1558 | use warnings; | |
1559 | local $Message = "ASCII pattern that really is UTF-8"; | |
1560 | my @w; | |
1561 | local $SIG {__WARN__} = sub {push @w, "@_"}; | |
0f289c68 | 1562 | my $c = qq (\x{DF}); |
e425a60b YO |
1563 | ok $c =~ /${c}|\x{100}/; |
1564 | ok @w == 0; | |
0f289c68 | 1565 | } |
e425a60b YO |
1566 | |
1567 | ||
1568 | { | |
1569 | local $Message = "Corruption of match results of qr// across scopes"; | |
1570 | my $qr = qr/(fo+)(ba+r)/; | |
1571 | 'foobar' =~ /$qr/; | |
1572 | iseq "$1$2", "foobar"; | |
1573 | { | |
1574 | 'foooooobaaaaar' =~ /$qr/; | |
0f289c68 | 1575 | iseq "$1$2", 'foooooobaaaaar'; |
e425a60b YO |
1576 | } |
1577 | iseq "$1$2", "foobar"; | |
1578 | } | |
1579 | ||
1580 | ||
1581 | { | |
1582 | local $Message = "HORIZWS"; | |
1583 | local $_ = "\t \r\n \n \t".chr(11)."\n"; | |
1584 | s/\H/H/g; | |
1585 | s/\h/h/g; | |
1586 | iseq $_, "hhHHhHhhHH"; | |
1587 | $_ = "\t \r\n \n \t" . chr (11) . "\n"; | |
1588 | utf8::upgrade ($_); | |
1589 | s/\H/H/g; | |
1590 | s/\h/h/g; | |
1591 | iseq $_, "hhHHhHhhHH"; | |
0f289c68 | 1592 | } |
e425a60b YO |
1593 | |
1594 | ||
1595 | { | |
1596 | local $Message = "Various whitespace special patterns"; | |
1597 | my @h = map {chr $_} 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, | |
1598 | 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, | |
1599 | 0x2007, 0x2008, 0x2009, 0x200a, 0x202f, 0x205f, | |
1600 | 0x3000; | |
1601 | my @v = map {chr $_} 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, | |
1602 | 0x2029; | |
1603 | my @lb = ("\x0D\x0A", map {chr $_} 0x0A .. 0x0D, 0x85, 0x2028, 0x2029); | |
1604 | foreach my $t ([\@h, qr/\h/, qr/\h+/], | |
1605 | [\@v, qr/\v/, qr/\v+/], | |
1606 | [\@lb, qr/\R/, qr/\R+/],) { | |
1607 | my $ary = shift @$t; | |
1608 | foreach my $pat (@$t) { | |
1609 | foreach my $str (@$ary) { | |
1610 | ok $str =~ /($pat)/, $pat; | |
1611 | iseq $1, $str, $pat; | |
1612 | utf8::upgrade ($str); | |
1613 | ok $str =~ /($pat)/, "Upgraded string - $pat"; | |
1614 | iseq $1, $str, "Upgraded string - $pat"; | |
1615 | } | |
1616 | } | |
1617 | } | |
1618 | } | |
1619 | ||
1620 | ||
1621 | { | |
1622 | local $Message = "Check that \\xDF match properly in its various forms"; | |
1623 | # Test that \xDF matches properly. this is pretty hacky stuff, | |
1624 | # but its actually needed. The malarky with '-' is to prevent | |
1625 | # compilation caching from playing any role in the test. | |
1626 | my @df = (chr (0xDF), '-', chr (0xDF)); | |
1627 | utf8::upgrade ($df [2]); | |
1628 | my @strs = ('ss', 'sS', 'Ss', 'SS', chr (0xDF)); | |
1629 | my @ss = map {("$_", "$_")} @strs; | |
1630 | utf8::upgrade ($ss [$_ * 2 + 1]) for 0 .. $#strs; | |
1631 | ||
1632 | for my $ssi (0 .. $#ss) { | |
1633 | for my $dfi (0 .. $#df) { | |
1634 | my $pat = $df [$dfi]; | |
1635 | my $str = $ss [$ssi]; | |
1636 | my $utf_df = ($dfi > 1) ? 'utf8' : ''; | |
1637 | my $utf_ss = ($ssi % 2) ? 'utf8' : ''; | |
1638 | (my $sstr = $str) =~ s/\xDF/\\xDF/; | |
1639 | ||
1640 | if ($utf_df || $utf_ss || length ($ss [$ssi]) == 1) { | |
1641 | my $ret = $str =~ /$pat/i; | |
1642 | next if $pat eq '-'; | |
1643 | ok $ret, "\"$sstr\" =~ /\\xDF/i " . | |
1644 | "(str is @{[$utf_ss||'latin']}, pat is " . | |
1645 | "@{[$utf_df||'latin']})"; | |
1646 | } | |
1647 | else { | |
1648 | my $ret = $str !~ /$pat/i; | |
1649 | next if $pat eq '-'; | |
1650 | ok $ret, "\"$sstr\" !~ /\\xDF/i " . | |
1651 | "(str is @{[$utf_ss||'latin']}, pat is " . | |
1652 | "@{[$utf_df||'latin']})"; | |
1653 | } | |
1654 | } | |
1655 | } | |
1656 | } | |
1657 | ||
1658 | ||
1659 | { | |
1660 | local $Message = "BBC(Bleadperl Breaks CPAN) Today: String::Multibyte"; | |
1661 | my $re = qr/(?:[\x00-\xFF]{4})/; | |
1662 | my $hyp = "\0\0\0-"; | |
1663 | my $esc = "\0\0\0\\"; | |
1664 | ||
1665 | my $str = "$esc$hyp$hyp$esc$esc"; | |
1666 | my @a = ($str =~ /\G(?:\Q$esc$esc\E|\Q$esc$hyp\E|$re)/g); | |
1667 | ||
1668 | iseq @a,3; | |
1669 | local $" = "="; | |
1670 | iseq "@a","$esc$hyp=$hyp=$esc$esc"; | |
1671 | } | |
1672 | ||
1673 | ||
1674 | { | |
1675 | # Test for keys in %+ and %- | |
1676 | local $Message = 'Test keys in %+ and %-'; | |
1677 | no warnings 'uninitialized'; | |
1678 | my $_ = "abcdef"; | |
1679 | /(?<foo>a)|(?<foo>b)/; | |
1680 | iseq ((join ",", sort keys %+), "foo"); | |
1681 | iseq ((join ",", sort keys %-), "foo"); | |
1682 | iseq ((join ",", sort values %+), "a"); | |
1683 | iseq ((join ",", sort map "@$_", values %-), "a "); | |
1684 | /(?<bar>a)(?<bar>b)(?<quux>.)/; | |
1685 | iseq ((join ",", sort keys %+), "bar,quux"); | |
1686 | iseq ((join ",", sort keys %-), "bar,quux"); | |
1687 | iseq ((join ",", sort values %+), "a,c"); # leftmost | |
1688 | iseq ((join ",", sort map "@$_", values %-), "a b,c"); | |
1689 | /(?<un>a)(?<deux>c)?/; # second buffer won't capture | |
1690 | iseq ((join ",", sort keys %+), "un"); | |
1691 | iseq ((join ",", sort keys %-), "deux,un"); | |
1692 | iseq ((join ",", sort values %+), "a"); | |
1693 | iseq ((join ",", sort map "@$_", values %-), ",a"); | |
1694 | } | |
1695 | ||
1696 | ||
1697 | { | |
1698 | # length() on captures, the numbered ones end up in Perl_magic_len | |
1699 | my $_ = "aoeu \xe6var ook"; | |
1700 | /^ \w+ \s (?<eek>\S+)/x; | |
1701 | ||
1702 | iseq length ($`), 0, q[length $`]; | |
1703 | iseq length ($'), 4, q[length $']; | |
1704 | iseq length ($&), 9, q[length $&]; | |
1705 | iseq length ($1), 4, q[length $1]; | |
1706 | iseq length ($+{eek}), 4, q[length $+{eek} == length $1]; | |
1707 | } | |
1708 | ||
1709 | ||
1710 | { | |
1711 | my $ok = -1; | |
1712 | ||
1713 | $ok = exists ($-{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; | |
1714 | iseq $ok, 1, '$-{x} exists after "bar"=~/(?<x>foo)|bar/'; | |
1715 | iseq scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'; | |
1716 | iseq scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'; | |
1717 | ||
1718 | $ok = -1; | |
1719 | $ok = exists ($+{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; | |
1720 | iseq $ok, 0, '$+{x} not exists after "bar"=~/(?<x>foo)|bar/'; | |
1721 | iseq scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'; | |
1722 | iseq scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'; | |
1723 | ||
1724 | $ok = -1; | |
1725 | $ok = exists ($-{x}) ? 1 : 0 if 'foo' =~ /(?<x>foo)|bar/; | |
1726 | iseq $ok, 1, '$-{x} exists after "foo"=~/(?<x>foo)|bar/'; | |
1727 | iseq scalar (%+), 1, 'scalar %+ == 1 after "foo"=~/(?<x>foo)|bar/'; | |
1728 | iseq scalar (%-), 1, 'scalar %- == 1 after "foo"=~/(?<x>foo)|bar/'; | |
1729 | ||
1730 | $ok = -1; | |
1731 | $ok = exists ($+{x}) ? 1 : 0 if 'foo'=~/(?<x>foo)|bar/; | |
1732 | iseq $ok, 1, '$+{x} exists after "foo"=~/(?<x>foo)|bar/'; | |
1733 | } | |
1734 | ||
1735 | ||
1736 | { | |
1737 | local $_; | |
1738 | ($_ = 'abc') =~ /(abc)/g; | |
0f289c68 | 1739 | $_ = '123'; |
e425a60b YO |
1740 | iseq "$1", 'abc', "/g leads to unsafe match vars: $1"; |
1741 | } | |
1742 | ||
1743 | ||
1744 | { | |
1745 | local $Message = 'Message-ID: <20070818091501.7eff4831@r2d2>'; | |
1746 | my $str = ""; | |
1747 | for (0 .. 5) { | |
1748 | my @x; | |
1749 | $str .= "@x"; # this should ALWAYS be the empty string | |
1750 | 'a' =~ /(a|)/; | |
1751 | push @x, 1; | |
1752 | } | |
1753 | iseq length ($str), 0, "Trie scope error, string should be empty"; | |
1754 | $str = ""; | |
1755 | my @foo = ('a') x 5; | |
1756 | for (@foo) { | |
1757 | my @bar; | |
1758 | $str .= "@bar"; | |
1759 | s/a|/push @bar, 1/e; | |
1760 | } | |
1761 | iseq length ($str), 0, "Trie scope error, string should be empty"; | |
1762 | } | |
1763 | ||
1764 | ||
1765 | { | |
e425a60b | 1766 | # more TRIE/AHOCORASICK problems with mixed utf8 / latin-1 and case folding |
0f289c68 YO |
1767 | for my $chr (160 .. 255) { |
1768 | my $chr_byte = chr($chr); | |
1769 | my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8); | |
1770 | my $rx = qr{$chr_byte|X}i; | |
1771 | ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr"); | |
1772 | } | |
e425a60b YO |
1773 | } |
1774 | ||
1775 | { | |
e425a60b YO |
1776 | our $a = 3; "" =~ /(??{ $a })/; |
1777 | our $b = $a; | |
1778 | iseq $b, $a, "Copy of scalar used for postponed subexpression"; | |
1779 | } | |
1780 | ||
1781 | ||
1782 | { | |
e425a60b YO |
1783 | our @ctl_n = (); |
1784 | our @plus = (); | |
1785 | our $nested_tags; | |
1786 | $nested_tags = qr{ | |
1787 | < | |
1788 | (\w+) | |
1789 | (?{ | |
1790 | push @ctl_n,$^N; | |
1791 | push @plus,$+; | |
1792 | }) | |
1793 | > | |
1794 | (??{$nested_tags})* | |
1795 | </\s* \w+ \s*> | |
1796 | }x; | |
1797 | ||
1798 | my $match = '<bla><blubb></blubb></bla>' =~ m/^$nested_tags$/; | |
1799 | ok $match, 'nested construct matches'; | |
1800 | iseq "@ctl_n", "bla blubb", '$^N inside of (?{}) works as expected'; | |
1801 | iseq "@plus", "bla blubb", '$+ inside of (?{}) works as expected'; | |
1802 | } | |
1803 | ||
1804 | ||
e425a60b YO |
1805 | SKIP: { |
1806 | # XXX: This set of tests is essentially broken, POSIX character classes | |
0f289c68 | 1807 | # should not have differing definitions under Unicode. |
e425a60b YO |
1808 | # There are property names for that. |
1809 | skip "Tests assume ASCII", 4 unless $IS_ASCII; | |
1810 | ||
1811 | my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/} | |
1812 | map {chr} 0x20 .. 0x7f; | |
1813 | iseq join ('', @notIsPunct), '$+<=>^`|~', | |
93f09d7b | 1814 | '[:punct:] disagrees with IsPunct on Symbols'; |
e425a60b YO |
1815 | |
1816 | my @isPrint = grep {not /[[:print:]]/ and /\p{IsPrint}/} | |
1817 | map {chr} 0 .. 0x1f, 0x7f .. 0x9f; | |
99870f4d KW |
1818 | iseq join ('', @isPrint), "", |
1819 | 'IsPrint agrees with [:print:] on control characters'; | |
e425a60b YO |
1820 | |
1821 | my @isPunct = grep {/[[:punct:]]/ != /\p{IsPunct}/} | |
1822 | map {chr} 0x80 .. 0xff; | |
0f289c68 | 1823 | iseq join ('', @isPunct), "\xa1\xab\xb7\xbb\xbf", # ¡ « · » ¿ |
e425a60b YO |
1824 | 'IsPunct disagrees with [:punct:] outside ASCII'; |
1825 | ||
1826 | my @isPunctLatin1 = eval q { | |
1827 | use encoding 'latin1'; | |
1828 | grep {/[[:punct:]]/ != /\p{IsPunct}/} map {chr} 0x80 .. 0xff; | |
1829 | }; | |
1830 | skip "Eval failed ($@)", 1 if $@; | |
1831 | skip "PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS set to 0", 1 | |
1832 | if !$ENV{PERL_TEST_LEGACY_POSIX_CC}; | |
0f289c68 | 1833 | iseq join ('', @isPunctLatin1), '', |
e425a60b | 1834 | 'IsPunct agrees with [:punct:] with explicit Latin1'; |
0f289c68 | 1835 | } |
d1c771f5 B |
1836 | |
1837 | ||
1838 | { | |
1839 | # Tests for [#perl 71942] | |
1840 | our $count_a; | |
1841 | our $count_b; | |
1842 | ||
1843 | my $c = 0; | |
1844 | for my $re ( | |
1845 | # [ | |
1846 | # should match?, | |
1847 | # input string, | |
1848 | # re 1, | |
1849 | # re 2, | |
1850 | # expected values of count_a and count_b, | |
1851 | # ] | |
1852 | [ | |
1853 | 0, | |
1854 | "xababz", | |
1855 | qr/a+(?{$count_a++})b?(*COMMIT)(*FAIL)/, | |
1856 | qr/a+(?{$count_b++})b?(*COMMIT)z/, | |
1857 | 1, | |
1858 | ], | |
1859 | [ | |
1860 | 0, | |
1861 | "xababz", | |
1862 | qr/a+(?{$count_a++})b?(*COMMIT)\s*(*FAIL)/, | |
1863 | qr/a+(?{$count_b++})b?(*COMMIT)\s*z/, | |
1864 | 1, | |
1865 | ], | |
1866 | [ | |
1867 | 0, | |
1868 | "xababz", | |
1869 | qr/a+(?{$count_a++})(?:b|)?(*COMMIT)(*FAIL)/, | |
1870 | qr/a+(?{$count_b++})(?:b|)?(*COMMIT)z/, | |
1871 | 1, | |
1872 | ], | |
1873 | [ | |
1874 | 0, | |
1875 | "xababz", | |
1876 | qr/a+(?{$count_a++})b{0,6}(*COMMIT)(*FAIL)/, | |
1877 | qr/a+(?{$count_b++})b{0,6}(*COMMIT)z/, | |
1878 | 1, | |
1879 | ], | |
1880 | [ | |
1881 | 0, | |
1882 | "xabcabcz", | |
1883 | qr/a+(?{$count_a++})(bc){0,6}(*COMMIT)(*FAIL)/, | |
1884 | qr/a+(?{$count_b++})(bc){0,6}(*COMMIT)z/, | |
1885 | 1, | |
1886 | ], | |
1887 | [ | |
1888 | 0, | |
1889 | "xabcabcz", | |
1890 | qr/a+(?{$count_a++})(bc*){0,6}(*COMMIT)(*FAIL)/, | |
1891 | qr/a+(?{$count_b++})(bc*){0,6}(*COMMIT)z/, | |
1892 | 1, | |
1893 | ], | |
1894 | ||
1895 | ||
1896 | [ | |
1897 | 0, | |
1898 | "aaaabtz", | |
1899 | qr/a+(?{$count_a++})b?(*PRUNE)(*FAIL)/, | |
1900 | qr/a+(?{$count_b++})b?(*PRUNE)z/, | |
1901 | 4, | |
1902 | ], | |
1903 | [ | |
1904 | 0, | |
1905 | "aaaabtz", | |
1906 | qr/a+(?{$count_a++})b?(*PRUNE)\s*(*FAIL)/, | |
1907 | qr/a+(?{$count_b++})b?(*PRUNE)\s*z/, | |
1908 | 4, | |
1909 | ], | |
1910 | [ | |
1911 | 0, | |
1912 | "aaaabtz", | |
1913 | qr/a+(?{$count_a++})(?:b|)(*PRUNE)(*FAIL)/, | |
1914 | qr/a+(?{$count_b++})(?:b|)(*PRUNE)z/, | |
1915 | 4, | |
1916 | ], | |
1917 | [ | |
1918 | 0, | |
1919 | "aaaabtz", | |
1920 | qr/a+(?{$count_a++})b{0,6}(*PRUNE)(*FAIL)/, | |
1921 | qr/a+(?{$count_b++})b{0,6}(*PRUNE)z/, | |
1922 | 4, | |
1923 | ], | |
1924 | [ | |
1925 | 0, | |
1926 | "aaaabctz", | |
1927 | qr/a+(?{$count_a++})(bc){0,6}(*PRUNE)(*FAIL)/, | |
1928 | qr/a+(?{$count_b++})(bc){0,6}(*PRUNE)z/, | |
1929 | 4, | |
1930 | ], | |
1931 | [ | |
1932 | 0, | |
1933 | "aaaabctz", | |
1934 | qr/a+(?{$count_a++})(bc*){0,6}(*PRUNE)(*FAIL)/, | |
1935 | qr/a+(?{$count_b++})(bc*){0,6}(*PRUNE)z/, | |
1936 | 4, | |
1937 | ], | |
1938 | ||
1939 | [ | |
1940 | 0, | |
1941 | "aaabaaab", | |
1942 | qr/a+(?{$count_a++;})b?(*SKIP)(*FAIL)/, | |
1943 | qr/a+(?{$count_b++;})b?(*SKIP)z/, | |
1944 | 2, | |
1945 | ], | |
1946 | [ | |
1947 | 0, | |
1948 | "aaabaaab", | |
1949 | qr/a+(?{$count_a++;})b?(*SKIP)\s*(*FAIL)/, | |
1950 | qr/a+(?{$count_b++;})b?(*SKIP)\s*z/, | |
1951 | 2, | |
1952 | ], | |
1953 | [ | |
1954 | 0, | |
1955 | "aaabaaab", | |
1956 | qr/a+(?{$count_a++;})(?:b|)(*SKIP)(*FAIL)/, | |
1957 | qr/a+(?{$count_b++;})(?:b|)(*SKIP)z/, | |
1958 | 2, | |
1959 | ], | |
1960 | [ | |
1961 | 0, | |
1962 | "aaabaaab", | |
1963 | qr/a+(?{$count_a++;})b{0,6}(*SKIP)(*FAIL)/, | |
1964 | qr/a+(?{$count_b++;})b{0,6}(*SKIP)z/, | |
1965 | 2, | |
1966 | ], | |
1967 | [ | |
1968 | 0, | |
1969 | "aaabcaaabc", | |
1970 | qr/a+(?{$count_a++;})(bc){0,6}(*SKIP)(*FAIL)/, | |
1971 | qr/a+(?{$count_b++;})(bc){0,6}(*SKIP)z/, | |
1972 | 2, | |
1973 | ], | |
1974 | [ | |
1975 | 0, | |
1976 | "aaabcaaabc", | |
1977 | qr/a+(?{$count_a++;})(bc*){0,6}(*SKIP)(*FAIL)/, | |
1978 | qr/a+(?{$count_b++;})(bc*){0,6}(*SKIP)z/, | |
1979 | 2, | |
1980 | ], | |
1981 | ||
1982 | ||
1983 | [ | |
1984 | 0, | |
1985 | "aaddbdaabyzc", | |
1986 | qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
1987 | qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) z \s* c \1 /x, | |
1988 | 4, | |
1989 | ], | |
1990 | [ | |
1991 | 0, | |
1992 | "aaddbdaabyzc", | |
1993 | qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) \s* (*FAIL) \s* c \1 /x, | |
1994 | qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) \s* z \s* c \1 /x, | |
1995 | 4, | |
1996 | ], | |
1997 | [ | |
1998 | 0, | |
1999 | "aaddbdaabyzc", | |
2000 | qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? (?:b|) (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2001 | qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? (?:b|) (*SKIP:T1) z \s* c \1 /x, | |
2002 | 4, | |
2003 | ], | |
2004 | [ | |
2005 | 0, | |
2006 | "aaddbdaabyzc", | |
2007 | qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? b{0,6} (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2008 | qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? b{0,6} (*SKIP:T1) z \s* c \1 /x, | |
2009 | 4, | |
2010 | ], | |
2011 | [ | |
2012 | 0, | |
2013 | "aaddbcdaabcyzc", | |
2014 | qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? (bc){0,6} (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2015 | qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? (bc){0,6} (*SKIP:T1) z \s* c \1 /x, | |
2016 | 4, | |
2017 | ], | |
2018 | [ | |
2019 | 0, | |
2020 | "aaddbcdaabcyzc", | |
2021 | qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? (bc*){0,6} (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2022 | qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? (bc*){0,6} (*SKIP:T1) z \s* c \1 /x, | |
2023 | 4, | |
2024 | ], | |
2025 | ||
2026 | ||
2027 | [ | |
2028 | 0, | |
2029 | "aaaaddbdaabyzc", | |
2030 | qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2031 | qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, | |
2032 | 2, | |
2033 | ], | |
2034 | [ | |
2035 | 0, | |
2036 | "aaaaddbdaabyzc", | |
2037 | qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) \s* (*FAIL) \s* c \1 /x, | |
2038 | qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) \s* z \s* c \1 /x, | |
2039 | 2, | |
2040 | ], | |
2041 | [ | |
2042 | 0, | |
2043 | "aaaaddbdaabyzc", | |
2044 | qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? (?:b|) (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2045 | qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? (?:b|) (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, | |
2046 | 2, | |
2047 | ], | |
2048 | [ | |
2049 | 0, | |
2050 | "aaaaddbdaabyzc", | |
2051 | qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? b{0,6} (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2052 | qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? b{0,6} (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, | |
2053 | 2, | |
2054 | ], | |
2055 | [ | |
2056 | 0, | |
2057 | "aaaaddbcdaabcyzc", | |
2058 | qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? (bc){0,6} (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2059 | qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? (bc){0,6} (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, | |
2060 | 2, | |
2061 | ], | |
2062 | [ | |
2063 | 0, | |
2064 | "aaaaddbcdaabcyzc", | |
2065 | qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? (bc*){0,6} (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, | |
2066 | qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? (bc*){0,6} (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, | |
2067 | 2, | |
2068 | ], | |
2069 | ||
2070 | ||
2071 | [ | |
2072 | 0, | |
2073 | "AbcdCBefgBhiBqz", | |
2074 | qr/(A (.*) (?{ $count_a++ }) C? (*THEN) | A D) (*FAIL)/x, | |
2075 | qr/(A (.*) (?{ $count_b++ }) C? (*THEN) | A D) z/x, | |
2076 | 1, | |
2077 | ], | |
2078 | [ | |
2079 | 0, | |
2080 | "AbcdCBefgBhiBqz", | |
2081 | qr/(A (.*) (?{ $count_a++ }) C? (*THEN) | A D) \s* (*FAIL)/x, | |
2082 | qr/(A (.*) (?{ $count_b++ }) C? (*THEN) | A D) \s* z/x, | |
2083 | 1, | |
2084 | ], | |
2085 | [ | |
2086 | 0, | |
2087 | "AbcdCBefgBhiBqz", | |
2088 | qr/(A (.*) (?{ $count_a++ }) (?:C|) (*THEN) | A D) (*FAIL)/x, | |
2089 | qr/(A (.*) (?{ $count_b++ }) (?:C|) (*THEN) | A D) z/x, | |
2090 | 1, | |
2091 | ], | |
2092 | [ | |
2093 | 0, | |
2094 | "AbcdCBefgBhiBqz", | |
2095 | qr/(A (.*) (?{ $count_a++ }) C{0,6} (*THEN) | A D) (*FAIL)/x, | |
2096 | qr/(A (.*) (?{ $count_b++ }) C{0,6} (*THEN) | A D) z/x, | |
2097 | 1, | |
2098 | ], | |
2099 | [ | |
2100 | 0, | |
2101 | "AbcdCEBefgBhiBqz", | |
2102 | qr/(A (.*) (?{ $count_a++ }) (CE){0,6} (*THEN) | A D) (*FAIL)/x, | |
2103 | qr/(A (.*) (?{ $count_b++ }) (CE){0,6} (*THEN) | A D) z/x, | |
2104 | 1, | |
2105 | ], | |
2106 | [ | |
2107 | 0, | |
2108 | "AbcdCBefgBhiBqz", | |
2109 | qr/(A (.*) (?{ $count_a++ }) (CE*){0,6} (*THEN) | A D) (*FAIL)/x, | |
2110 | qr/(A (.*) (?{ $count_b++ }) (CE*){0,6} (*THEN) | A D) z/x, | |
2111 | 1, | |
2112 | ], | |
2113 | ) { | |
2114 | $c++; | |
2115 | $count_a = 0; | |
2116 | $count_b = 0; | |
2117 | ||
2118 | my $match_a = ($re->[1] =~ $re->[2]) || 0; | |
2119 | my $match_b = ($re->[1] =~ $re->[3]) || 0; | |
2120 | ||
2121 | iseq($match_a, $re->[0], "match a " . ($re->[0] ? "succeeded" : "failed") . " ($c)"); | |
2122 | iseq($match_b, $re->[0], "match b " . ($re->[0] ? "succeeded" : "failed") . " ($c)"); | |
2123 | iseq($count_a, $re->[4], "count a ($c)"); | |
2124 | iseq($count_b, $re->[4], "count b ($c)"); | |
2125 | } | |
2126 | } | |
e425a60b | 2127 | |
b57e4118 KW |
2128 | { # Bleadperl v5.13.8-292-gf56b639 breaks NEZUMI/Unicode-LineBreak-1.011 |
2129 | # \xdf in lookbehind failed to compile as is multi-char fold | |
2130 | eval_ok 'qr{ | |
2131 | (?u: (?<=^url:) | | |
2132 | (?<=[/]) (?=[^/]) | | |
2133 | (?<=[^-.]) (?=[-~.,_?\#%=&]) | | |
2134 | (?<=[=&]) (?=.) | |
2135 | )}iox', "Lookbehind with \\xdf matchable compiles"; | |
2136 | } | |
2137 | ||
84193928 KW |
2138 | # RT #82610 |
2139 | ok 'foo/file.fob' =~ m,^(?=[^\.])[^/]*/(?=[^\.])[^/]*\.fo[^/]$,; | |
2140 | ||
e425a60b YO |
2141 | # |
2142 | # Keep the following tests last -- they may crash perl | |
2143 | # | |
2144 | print "# Tests that follow may crash perl\n"; | |
e425a60b YO |
2145 | { |
2146 | eval '/\k/'; | |
2147 | ok $@ =~ /\QSequence \k... not terminated in regex;\E/, | |
2148 | 'Lone \k not allowed'; | |
2149 | } | |
2150 | ||
e425a60b YO |
2151 | { |
2152 | local $Message = "Substitution with lookahead (possible segv)"; | |
2153 | $_ = "ns1ns1ns1"; | |
2154 | s/ns(?=\d)/ns_/g; | |
2155 | iseq $_, "ns_1ns_1ns_1"; | |
2156 | $_ = "ns1"; | |
2157 | s/ns(?=\d)/ns_/; | |
2158 | iseq $_, "ns_1"; | |
2159 | $_ = "123"; | |
2160 | s/(?=\d+)|(?<=\d)/!Bang!/g; | |
2161 | iseq $_, "!Bang!1!Bang!2!Bang!3!Bang!"; | |
2162 | } | |
2163 | ||
6182169b KW |
2164 | { |
2165 | # Earlier versions of Perl said this was fatal. | |
2166 | local $Message = "U+0FFFF shouldn't crash the regex engine"; | |
2167 | no warnings 'utf8'; | |
2168 | my $a = eval "chr(65535)"; | |
2169 | use warnings; | |
2170 | my $warning_message; | |
2171 | local $SIG{__WARN__} = sub { $warning_message = $_[0] }; | |
2172 | eval $a =~ /[a-z]/; | |
2173 | ok(1); # If it didn't crash, it worked. | |
2174 | } | |
b57e4118 KW |
2175 | |
2176 | # !!! NOTE that tests that aren't at all likely to crash perl should go | |
2177 | # a ways above, above these last ones. | |
e425a60b YO |
2178 | } # End of sub run_tests |
2179 | ||
2180 | 1; |