Commit | Line | Data |
---|---|---|
034602eb KW |
1 | BEGIN { |
2 | chdir 't' if -d 't'; | |
3 | require './test.pl'; | |
4 | set_up_inc('../lib'); | |
5 | } | |
6 | ||
7 | use strict; | |
8 | use warnings; | |
9 | ||
10 | $|=1; | |
11 | ||
12 | # The Script_Extension property has only recently become reasonably stable, so | |
13 | # later Unicode releases may change things. Some of these tests were | |
14 | # designed to provide more code coverage in regexec.c, so changes in it or | |
15 | # later Standards could cause them to not test what they originally were aimed | |
16 | # to do. | |
17 | ||
526c4163 KW |
18 | # Since there are so few tests currently, we can afford to try each syntax on |
19 | # all of them | |
3337229c | 20 | foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') { |
526c4163 KW |
21 | my $script_run; |
22 | eval '$script_run = qr/ ^ (*$type: .* ) $ /x; 1' or die $@; # pattern is compiled at run time for each $type; fail loudly if it doesn't compile | |
034602eb | 23 | |
e54417e0 KW |
24 | unlike("\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER U}\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}l", $script_run, "Cyrillic 'paypal' with a Latin 'l' is not a script run"); |
25 | unlike("A\N{GREEK CAPITAL LETTER GAMMA}", $script_run, "Latin followed by Greek isn't a script run"); | |
034602eb | 26 | |
e54417e0 KW |
27 | like("\N{CYRILLIC THOUSANDS SIGN}\N{COMBINING CYRILLIC TITLO}", $script_run, "Cyrillic followed by Permic-Cyrillic is Cyrillic"); |
28 | like("\N{OLD PERMIC LETTER AN}\N{COMBINING CYRILLIC TITLO}", $script_run, "Permic followed by Permic-Cyrillic is Permic"); | |
29 | unlike("\N{GLAGOLITIC CAPITAL LETTER AZU}\N{COMBINING CYRILLIC TITLO}", $script_run, "Glagolitic followed by Permic-Cyrillic isn't a script run"); | |
034602eb | 30 | |
e54417e0 KW |
31 | like("\N{CYRILLIC THOUSANDS SIGN}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Cyrillic followed by Glagolitic-Cyrillic is Cyrillic"); |
32 | like("\N{GLAGOLITIC CAPITAL LETTER AZU}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Glagolitic followed by Glagolitic-Cyrillic is Glagolitic"); | |
33 | unlike("\N{OLD PERMIC LETTER AN}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Permic followed by Glagolitic-Cyrillic isn't a script run"); | |
034602eb | 34 | |
e54417e0 KW |
35 | like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{ARABIC COMMA}\N{ARABIC-INDIC DIGIT FOUR}\N{THAANA LETTER HAA}", $script_run, "Arabic-Thaana chars followed by Thaana is Thaana"); |
36 | unlike("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}A", $script_run, "Arabic-Thaana chars followed by Latin isn't a script run"); | |
37 | like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{ARABIC COMMA}\N{ARABIC-INDIC DIGIT FOUR}\N{ARABIC NUMBER SIGN}", $script_run, "Arabic-Thaana chars followed by Arabic is Arabic"); | |
38 | unlike("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{EXTENDED ARABIC-INDIC DIGIT NINE}", $script_run, "Arabic-Thaana digits followed by an Arabic digit from a different sequence isn't a script run"); | |
39 | like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{THAANA LETTER HAA}", $script_run, "Arabic-Thaana digits followed by a Thaana letter is a script run"); | |
034602eb | 40 | |
e54417e0 KW |
41 | # The next tests are at a hard-coded boundary in regexec.c at the time of this |
42 | # writing (U+02B9/02BA). | |
43 | like("abc\N{MODIFIER LETTER SMALL Y}", $script_run, "All Latin is a script run"); | |
44 | like("abc\N{MODIFIER LETTER PRIME}", $script_run, "Latin then Common is a script run"); | |
45 | like(":a", $script_run, "Common then Latin is a script run"); | |
46 | like("-\N{SINHALA LETTER RAYANNA}", $script_run, "Common then Sinhala (which has its own 0) is a script run"); | |
81eabee7 | 47 | |
e54417e0 KW |
48 | like("\N{HEBREW LETTER ALEF}\N{HEBREW LETTER TAV}\N{MODIFIER LETTER PRIME}", $script_run, "Hebrew then Common is a script run"); |
49 | unlike("\N{HEBREW LETTER ALEF}\N{HEBREW LETTER TAV}\N{MODIFIER LETTER SMALL Y}", $script_run, "Hebrew then Latin isn't a script run"); | |
50 | like("9876543210\N{DESERET SMALL LETTER WU}", $script_run, "0-9 are the digits for Deseret"); | |
51 | like("\N{DESERET SMALL LETTER WU}9876543210", $script_run, "Also when they aren't in the initial position"); | |
f4e61fc0 KW |
52 | like("\N{DESERET SMALL LETTER WU}\N{FULLWIDTH DIGIT FIVE}", $script_run, "Fullwidth digits may be digits for Deseret"); |
53 | like("\N{FULLWIDTH DIGIT SIX}\N{DESERET SMALL LETTER LONG I}", $script_run, "... likewise if the digits come first"); | |
034602eb | 54 | |
e54417e0 KW |
55 | like("1234567890\N{ARABIC LETTER ALEF}", $script_run, "[0-9] work for Arabic"); |
56 | unlike("1234567890\N{ARABIC LETTER ALEF}\N{ARABIC-INDIC DIGIT FOUR}\N{ARABIC-INDIC DIGIT FIVE}", $script_run, "... but not in combination with real ARABIC digits"); | |
57 | unlike("\N{ARABIC LETTER ALEF}\N{ARABIC-INDIC DIGIT SIX}\N{ARABIC-INDIC DIGIT SEVEN}1", $script_run, "... nor when the ARABIC digits come before them"); | |
034602eb | 58 | |
e54417e0 KW |
59 | # This exercises the case where the script zero but not the script is |
60 | # ambiguous until a non-ambiguous digit is found. | |
61 | like("\N{ARABIC LETTER ALEF}\N{EXTENDED ARABIC-INDIC DIGIT EIGHT}", $script_run, "ARABIC with a Shia digit is a script run"); | |
034602eb | 62 | |
e54417e0 KW |
63 | like("\N{U+03A2}", $script_run, "A single unassigned code point is a script run"); |
64 | unlike("\N{U+03A2}\N{U+03A2}", $script_run, "But not more than one"); | |
65 | unlike("A\N{U+03A2}", $script_run, "... and not in combination with an assigned one"); | |
66 | unlike("\N{U+03A2}A", $script_run, "... in either order"); | |
67 | unlike("\N{U+03A2}0", $script_run, "... nor with a digit following"); | |
034602eb | 68 | |
e54417e0 KW |
69 | like("A\N{COMBINING GRAVE ACCENT}", $script_run, "An inherited script matches others"); |
70 | like("\N{COMBINING GRAVE ACCENT}A", $script_run, "... even if first in the sequence"); | |
034602eb | 71 | |
e54417e0 | 72 | like("\N{COMBINING TILDE}\N{COMBINING GRAVE ACCENT}", $script_run, "A script containing only inherited characters matches"); |
b548543c | 73 | |
e54417e0 | 74 | like("\N{DEVANAGARI DOUBLE DANDA}\N{DEVANAGARI DANDA}\N{DEVANAGARI STRESS SIGN UDATTA}\N{DEVANAGARI STRESS SIGN ANUDATTA}\N{NORTH INDIC FRACTION ONE QUARTER}\N{NORTH INDIC QUANTITY MARK}", $script_run, "A bunch of narrowing down of multiple possible scripts"); |
034602eb | 75 | |
e54417e0 KW |
76 | unlike("\N{JAVANESE PANGRANGKEP}\N{GEORGIAN PARAGRAPH SEPARATOR}", $script_run, "Two code points each in multiple scripts that don't intersect aren't a script run"); |
77 | like("\N{DEVANAGARI SIGN CANDRABINDU VIRAMA}\N{VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA}", $script_run, "Two code points each in multiple scripts that don't intersect singly are a script run"); | |
034602eb | 78 | |
e54417e0 | 79 | like("", $script_run, "An empty string is a script run"); |
ad85be64 | 80 | |
e54417e0 | 81 | use utf8; |
034602eb | 82 | |
e54417e0 KW |
83 | # From UTS 39 |
84 | like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han"); | |
7da8e27b KW |
85 | |
86 | unlike "\N{THAI DIGIT FIVE}1", $script_run, "Thai digit followed by '1'"; | |
87 | unlike "1\N{THAI DIGIT FIVE}", $script_run, "'1' followed by Thai digit "; | |
88 | unlike "\N{BENGALI DIGIT ZERO}\N{CHAKMA DIGIT SEVEN}", $script_run, | |
89 | "Two digits in same extended script but from different sets of 10"; | |
526c4163 | 90 | } |
034602eb | 91 | |
8638266f | 92 | # Until fixed, this was skipping the '[' |
e9c7e9d5 KW |
93 | unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x, |
94 | "Doesn't skip parts of exact matches"); | |
8638266f | 95 | |
e9c7e9d5 KW |
96 | like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run"); |
97 | unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, | |
98 | "Nested asr works to exclude some things"); | |
3337229c | 99 | |
393e5a45 KW |
100 | like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/, |
101 | "Script with own zero works with ASCII digits"); # perl #133547 | |
102 | like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/, | |
103 | "Script without own zero works with ASCII digits"); | |
104 | ||
f4e61fc0 KW |
105 | like("A\x{ff10}\x{ff19}B", qr/^(*sr:.{4})/, |
106 | "Non-ASCII Common digits work with Latin"); # perl #133547 | |
107 | like("A\x{ff10}BC", qr/^(*sr:.{4})/, | |
108 | "Non-ASCII Common digits work with Latin"); # perl #133547 | |
109 | like("A\x{1d7ce}\x{1d7cf}B", qr/^(*sr:.{4})/, | |
110 | "Non-ASCII Common digits work with Latin"); # perl #133547 | |
111 | like("A\x{1d7ce}BC", qr/^(*sr:.{4})/, | |
112 | "Non-ASCII Common digits work with Latin"); # perl #133547 | |
113 | like("\x{1d7ce}\x{1d7cf}AB", qr/^(*sr:.{4})/, | |
114 | "Non-ASCII Common digits work with Latin"); # perl #133547 | |
115 | like("\N{GREEK SMALL LETTER ALPHA}\x{1d7ce}\N{GREEK SMALL LETTER BETA}\N{GREEK SMALL LETTER GAMMA}", qr/^(*sr:.{4})/, # \N{} escapes, not literals: 'use utf8' is not in effect at file scope | |
116 | "Non-ASCII Common digits work with Greek"); # perl #133547 | |
117 | like("\x{1d7ce}\N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER BETA}\N{GREEK SMALL LETTER GAMMA}", qr/^(*sr:.{4})/, # \N{} escapes, not literals: 'use utf8' is not in effect at file scope | |
118 | "Non-ASCII Common digits work with Greek"); # perl #133547 | |
119 | ||
37fc2e9a | 120 | fresh_perl_is('print scalar "0" =~ m!(((*sr:()|)0)(*sr:)0|)!;', |
8e9f3eef KW |
121 | 1, {}, '[perl #133997]'); |
122 | ||
034602eb | 123 | done_testing(); |