This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Change syntax of script runs
[perl5.git] / t / re / script_run.t
CommitLineData
034602eb
KW
1BEGIN {    # executed at compile time, before the rest of the script is compiled
2 chdir 't' if -d 't';    # move into t/ when that directory exists
3 require './test.pl';    # load test.pl from the current directory; presumably the source of like/unlike/done_testing
4 set_up_inc('../lib');    # helper not defined in this file; presumably points @INC at ../lib -- confirm in test.pl
5}
6
7use strict;
8use warnings;
9
10$|=1;
11
12# The Script_Extension property has only recently become reasonably stable, so
13# later Unicode releases may change things. Some of these tests were
14# designed to provide more code coverage in regexec.c, so changes in it or
15# later Standards could cause them to not test what they originally were aimed
16# to do.
17
18no warnings "experimental::script_run";
19
d9790612 20my $script_run = qr/ ^ (*script_run: .* ) $ /x;    # anchored: matches only when everything between ^ and $ forms one script run
034602eb
KW
21
22unlike("\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER U}\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}l", $script_run, "Cyrillic 'paypal' with a Latin 'l' is not a script run");    # five Cyrillic look-alike letters, then a Latin 'l'
23unlike("A\N{GREEK CAPITAL LETTER GAMMA}", $script_run, "Latin followed by Greek isn't a script run");
24
25like("\N{CYRILLIC THOUSANDS SIGN}\N{COMBINING CYRILLIC TITLO}", $script_run, "Cyrillic followed by Cyrillic-Permic is Cyrillic");    # these three tests: TITLO combines with Cyrillic and Old Permic, but not Glagolitic
26like("\N{OLD PERMIC LETTER AN}\N{COMBINING CYRILLIC TITLO}", $script_run, "Permic followed by Cyrillic-Permic is Permic");
27unlike("\N{GLAGOLITIC CAPITAL LETTER AZU}\N{COMBINING CYRILLIC TITLO}", $script_run, "Glagolitic followed by Cyrillic-Permic isn't a script run");
28
29like("\N{CYRILLIC THOUSANDS SIGN}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Cyrillic followed by Cyrillic-Glagolitic is Cyrillic");    # these three tests: PALATALIZATION combines with Cyrillic and Glagolitic, but not Old Permic
30like("\N{GLAGOLITIC CAPITAL LETTER AZU}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Glagolitic followed by Cyrillic-Glagolitic is Glagolitic");
31unlike("\N{OLD PERMIC LETTER AN}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Permic followed by Cyrillic-Glagolitic isn't a script run");
32
33like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{ARABIC COMMA}\N{ARABIC-INDIC DIGIT FOUR}\N{THAANA LETTER HAA}", $script_run, "Arabic-Thaana chars followed by Thaana is Thaana");
34unlike("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}A", $script_run, "Arabic-Thaana chars followed by Latin isn't a script run");
35like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{ARABIC COMMA}\N{ARABIC-INDIC DIGIT FOUR}\N{ARABIC NUMBER SIGN}", $script_run, "Arabic-Thaana chars followed by Arabic is Arabic");
36unlike("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{EXTENDED ARABIC-INDIC DIGIT NINE}", $script_run, "Arabic-Thaana digits followed by an Arabic digit from a different sequence isn't a script run");
37like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{THAANA LETTER HAA}", $script_run, "Arabic-Thaana digits followed by a Thaana letter is a script run");    # same digits as the previous test, but followed by a Thaana letter
38
39# The next tests are at a hard-coded boundary in regexec.c at the time of this
40# writing (U+02B9/02BA).
41like("abc\N{MODIFIER LETTER SMALL Y}", $script_run, "All Latin is a script run");
42like("abc\N{MODIFIER LETTER PRIME}", $script_run, "Latin then Common is a script run");
8535a06f 43like(":a", $script_run, "Common then Latin is a script run");
81eabee7
KW
44like("-\N{SINHALA LETTER RAYANNA}", $script_run, "Common then Sinhala (which has its own 0) is a script run");
45
034602eb
KW
46like("\N{HEBREW LETTER ALEF}\N{HEBREW LETTER TAV}\N{MODIFIER LETTER PRIME}", $script_run, "Hebrew then Common is a script run");
47unlike("\N{HEBREW LETTER ALEF}\N{HEBREW LETTER TAV}\N{MODIFIER LETTER SMALL Y}", $script_run, "Hebrew then Latin isn't a script run");
48like("9876543210\N{DESERET SMALL LETTER WU}", $script_run, "0-9 are the digits for Deseret");
49like("\N{DESERET SMALL LETTER WU}9876543210", $script_run, "Also when they aren't in the initial position");
50unlike("\N{DESERET SMALL LETTER WU}\N{FULLWIDTH DIGIT FIVE}", $script_run, "Fullwidth digits aren't the digits for Deseret");
51unlike("\N{FULLWIDTH DIGIT SIX}\N{DESERET SMALL LETTER LONG I}", $script_run, "... likewise if the digits come first");
52
53like("1234567890\N{ARABIC LETTER ALEF}", $script_run, "[0-9] work for Arabic");
54unlike("1234567890\N{ARABIC LETTER ALEF}\N{ARABIC-INDIC DIGIT FOUR}\N{ARABIC-INDIC DIGIT FIVE}", $script_run, "... but not in combination with real ARABIC digits");
55unlike("\N{ARABIC LETTER ALEF}\N{ARABIC-INDIC DIGIT SIX}\N{ARABIC-INDIC DIGIT SEVEN}1", $script_run, "... nor when the ARABIC digits come before them");
56
57# This exercises the case where the script zero but not the script is
58# ambiguous until a non-ambiguous digit is found.
59like("\N{ARABIC LETTER ALEF}\N{EXTENDED ARABIC-INDIC DIGIT EIGHT}", $script_run, "ARABIC with a Shia digit is a script run");
60
61like("\N{U+03A2}", $script_run, "A single unassigned code point is a script run");
62unlike("\N{U+03A2}\N{U+03A2}", $script_run, "But not more than one");
63unlike("A\N{U+03A2}", $script_run, "... and not in combination with an assigned one");
64unlike("\N{U+03A2}A", $script_run, "... in either order");
13d9cfd2 65unlike("\N{U+03A2}0", $script_run, "... nor with a digit following");
034602eb
KW
66
67like("A\N{COMBINING GRAVE ACCENT}", $script_run, "An inherited script matches others");
68like("\N{COMBINING GRAVE ACCENT}A", $script_run, "... even if first in the sequence");
69
b548543c
KW
70like("\N{COMBINING TILDE}\N{COMBINING GRAVE ACCENT}", $script_run, "A script containing only inherited characters matches");
71
034602eb
KW
72like("\N{DEVANAGARI DOUBLE DANDA}\N{DEVANAGARI DANDA}\N{DEVANAGARI STRESS SIGN UDATTA}\N{DEVANAGARI STRESS SIGN ANUDATTA}\N{NORTH INDIC FRACTION ONE QUARTER}\N{NORTH INDIC QUANTITY MARK}", $script_run, "A bunch of narrowing down of multiple possible scripts");
73
74unlike("\N{JAVANESE PANGRANGKEP}\N{GEORGIAN PARAGRAPH SEPARATOR}", $script_run, "Two code points each in multiple scripts that don't intersect aren't a script run");
75like("\N{DEVANAGARI SIGN CANDRABINDU VIRAMA}\N{VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA}", $script_run, "Two code points each in multiple scripts that 't intersect singly are a script run");
76
ad85be64
KW
77like("", $script_run, "An empty string is a script run");
78
034602eb
KW
79use utf8;
80
81# From UTS 39
82like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han");
83
84done_testing();