* baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
+ * a3f3caba903e4d39b6c7aaa7ea4d3a739e745b010ad51cf0e05f34ffa0ac2c04 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
# Test_WB()
Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
+Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid
+Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid
EOF_CODE
# Sort these so get results in same order on different runs of this
# baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
# 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
# 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
-# 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
+# a3f3caba903e4d39b6c7aaa7ea4d3a739e745b010ad51cf0e05f34ffa0ac2c04 lib/unicore/mktables
# 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
# 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
#define PMf_IS_QR (1U<<(PMf_BASE_SHIFT+15))
#define PMf_USE_RE_EVAL (1U<<(PMf_BASE_SHIFT+16)) /* use re'eval' in scope */
+/* Means that this is a subpattern being compiled while processing a \p{}
+ * wildcard. This isn't called from op.c, but it is passed as a pm flag. */
+#define PMf_WILDCARD (1U<<(PMf_BASE_SHIFT+17))
+
/* See comments at the beginning of these defines about adding bits. The
* highest bit position should be used, so that if PMf_BASE_SHIFT gets
* increased, the #error below will be triggered so that you will be reminded
[ List each incompatible change as a =head2 entry ]
+=head2 Certain pattern matching features are now prohibited in compiling
+Unicode property value wildcard subpatterns
+
+These few features are either inappropriate or interfere with the
+algorithm used to accomplish this task. The complete list is in
+L<perlunicode/Wildcards in Property Values>.
+
=head2 Stop pretending C<POSIX::mbstowcs> and C<POSIX::wcstombs> are
supported
(F) You attempted to use a feature of printf that is accessible from
only C. This usually means there's a better way to do it in Perl.
+=item Use of %s is not allowed in Unicode property wildcard
+subpatterns in regex; marked by S<<-- HERE> in m/%s/
+
+(F) You were using a wildcard subpattern in a Unicode property value, and
+the subpattern contained something that is illegal. Not all regular
+expression capabilities are legal in such subpatterns, and this is one.
+Rewrite your subpattern to not use the offending construct.
+See L<perlunicode/Wildcards in Property Values>.
+
=item Use of -l on filehandle%s
(W io) A filehandle represents an opened file, and when you opened the file
No modifiers may follow the final delimiter. Instead, use
L<perlre/(?adlupimnsx-imnsx)> and/or
L<perlre/(?adluimnsx-imnsx:pattern)> to specify modifiers.
+However, certain modifiers are illegal in your wildcard subpattern.
+The only character set modifier specifiable is C</aa>;
+any other character set, and C<-m>, and C<p>, and C<s> are all illegal.
+Specifying modifiers like C<qr/.../gc> that aren't legal in the
+C<(?...)> notation normally raises a warning, but with wildcard
+subpatterns, their use is an error. The C<m> modifier is ineffective;
+everything that matches will be a single line.
+
+By default, your pattern is matched case-insensitively, as if C</i> had
+been specified. You can change this by saying C<(?-i)> in your pattern.
+
+There are also certain operations that are illegal. You can't nest
+C<\p{...}> and C<\P{...}> calls within a wildcard subpattern, and C<\G>
+doesn't make sense, so is also prohibited.
+
+And the C<*> quantifier (or its equivalent C<{0,}>) is illegal.
This feature is not available when the left-hand side is prefixed by
C<Is_>, nor for any form that is marked as "Discouraged" in
L<perluniprops/Discouraged>.
-By default, your pattern is matched case-insensitively, as if C</i> had
-been specified. You can change this by saying C<(?-i)> in your pattern.
-
This experimental feature has been added to begin to implement
L<https://www.unicode.org/reports/tr18/#Wildcard_Properties>. Using it
will raise a (default-on) warning in the
* baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
+ * a3f3caba903e4d39b6c7aaa7ea4d3a739e745b010ad51cf0e05f34ffa0ac2c04 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl
*
* pm_flags contains the PMf_* flags, typically based on those from the
* pm_flags field of the related PMOP. Currently we're only interested in
- * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
+ * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
*
* For many years this code had an initial sizing pass that calculated
* (sometimes incorrectly, leading to security holes) the size needed for the
/* && memCHRs("iogcmsx", *RExC_parse) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
+ if ((RExC_pm_flags & PMf_WILDCARD)) {
+ if (flagsp == & negflags) {
+ if (*RExC_parse == 'm') {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by <--
+ HERE in m/%s/ */
+ vFAIL("Use of modifier '-m' is not allowed in Unicode"
+ " property wildcard subpatterns");
+ }
+ }
+ else {
+ if (*RExC_parse == 's') {
+ goto modifier_illegal_in_wildcard;
+ }
+ }
+ }
+
switch (*RExC_parse) {
/* Code for the imsxn flags */
*(RExC_parse - 1));
NOT_REACHED; /*NOTREACHED*/
case GLOBAL_PAT_MOD: /* 'g' */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ goto modifier_illegal_in_wildcard;
+ }
+ /*FALLTHROUGH*/
case ONCE_PAT_MOD: /* 'o' */
if (ckWARN(WARN_REGEXP)) {
const I32 wflagbit = *RExC_parse == 'o'
break;
case CONTINUE_PAT_MOD: /* 'c' */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ goto modifier_illegal_in_wildcard;
+ }
if (ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
}
break;
case KEEPCOPY_PAT_MOD: /* 'p' */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ goto modifier_illegal_in_wildcard;
+ }
if (flagsp == &negflags) {
ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
} else {
case ':':
case ')':
+ if ( (RExC_pm_flags & PMf_WILDCARD)
+ && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+ {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by <--
+ HERE in m/%s/ */
+ vFAIL2("Use of modifier '%c' is not allowed in Unicode"
+ " property wildcard subpatterns",
+ has_charset_modifier);
+ }
+
if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
negflags |= RXf_PMf_EXTENDED_MORE;
}
}
vFAIL("Sequence (?... not terminated");
+
+ modifier_illegal_in_wildcard:
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
+ subpatterns in regex; marked by <-- HERE in m/%s/ */
+ vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
+ " subpatterns", *(RExC_parse - 1));
}
/*
do_curly:
if ((flags&SIMPLE)) {
if (min == 0 && max == REG_INFTY) {
+
+ /* Going from 0..inf is currently forbidden in wildcard
+ * subpatterns. The only reason is to make it harder to
+ * write patterns that take a long long time to halt, and
+ * because the use of this construct isn't necessary in
+ * matching Unicode property values */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by
+ <-- HERE in m/%s/ */
+ vFAIL("Use of quantifier '*' is not allowed in"
+ " Unicode property wildcard subpatterns");
+ /* Note, don't need to worry about {0,}, as a '}' isn't
+ * legal at all in wildcards, so wouldn't get this far
+ * */
+ }
reginsert(pRExC_state, STAR, ret, depth+1);
MARK_NAUGHTY(4);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
- ret = reg_node(pRExC_state, SBOL);
- /* SBOL is shared with /^/ so we set the flags so we can tell
- * /\A/ from /^/ in split. */
- FLAGS(REGNODE_p(ret)) = 1;
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ ret = reg_node(pRExC_state, MBOL);
+ }
+ else {
+ ret = reg_node(pRExC_state, SBOL);
+ /* SBOL is shared with /^/ so we set the flags so we can tell
+ * /\A/ from /^/ in split. */
+ FLAGS(REGNODE_p(ret)) = 1;
+ }
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode property
+ wildcard subpatterns in regex; marked by <-- HERE in m/%s/
+ */
+ vFAIL("Use of '\\G' is not allowed in Unicode property"
+ " wildcard subpatterns");
+ }
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_GPOS_SEEN;
*flagp |= SIMPLE;
vFAIL("\\K not permitted in lookahead/lookbehind");
}
case 'Z':
- ret = reg_node(pRExC_state, SEOL);
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ ret = reg_node(pRExC_state, MEOL);
+ }
+ else {
+ ret = reg_node(pRExC_state, SEOL);
+ }
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'z':
- ret = reg_node(pRExC_state, EOS);
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ ret = reg_node(pRExC_state, MEOL);
+ }
+ else {
+ ret = reg_node(pRExC_state, EOS);
+ }
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
{
char *e;
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by <--
+ HERE in m/%s/ */
+ vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
+ " wildcard subpatterns", value, *(RExC_parse - 1));
+ }
+
/* \p means they want Unicode semantics */
REQUIRE_UNI_RULES(flagp, 0);
S_compile_wildcard(pTHX_ const char * name, const STRLEN len,
const bool ignore_case)
{
- U32 flags = PMf_MULTILINE;
+ U32 flags = PMf_MULTILINE|PMf_WILDCARD;
REGEXP * subpattern_re;
PERL_ARGS_ASSERT_COMPILE_WILDCARD;
'/(?[\N{KEYCAP DIGIT NINE}/' => '\N{} here is restricted to one character {#} m/(?[\\N{U+39.FE0F.20E3{#}}/', # [perl #133988]
'/0000000000000000[\N{U+0.00}0000/' => 'Unmatched [ {#} m/0000000000000000[{#}\N{U+0.00}0000/', # [perl #134059]
'/\p{nv=\b5\b}/' => 'Can\'t find Unicode property definition "nv=\\b5\\b" {#} m/\\p{nv=\\b5\\b}{#}/',
+ '/\p{nv=:(?g)10:}/' => 'Use of modifier \'g\' is not allowed in Unicode property wildcard subpatterns {#} m/(?g{#})10/',
+ '/\p{gc=:L*:}/' => 'Use of quantifier \'*\' is not allowed in Unicode property wildcard subpatterns {#} m/L*{#}/',
+ '/\p{gc=:L\G:}/' => 'Use of \'\G\' is not allowed in Unicode property wildcard subpatterns {#} m/L\G{#}/',
+ '/\p{gc=:(?a)L:}/' => 'Use of modifier \'a\' is not allowed in Unicode property wildcard subpatterns {#} m/(?a){#}L/',
+ '/\p{gc=:(?u)L:}/' => 'Use of modifier \'u\' is not allowed in Unicode property wildcard subpatterns {#} m/(?u){#}L/',
+ '/\p{gc=:(?d)L:}/' => 'Use of modifier \'d\' is not allowed in Unicode property wildcard subpatterns {#} m/(?d){#}L/',
+ '/\p{gc=:(?l)L:}/' => 'Use of modifier \'l\' is not allowed in Unicode property wildcard subpatterns {#} m/(?l){#}L/',
+ '/\p{gc=:(?-m)L:}/' => 'Use of modifier \'-m\' is not allowed in Unicode property wildcard subpatterns {#} m/(?-m{#})L/',
+ '/\p{gc=:\pS:}/' => 'Use of \'\\pS\' is not allowed in Unicode property wildcard subpatterns {#} m/\\pS{#}/',
+ '/\p{gc=:\PS:}/' => 'Use of \'\\PS\' is not allowed in Unicode property wildcard subpatterns {#} m/\\PS{#}/',
+ '/\p{gc=:[\pS]:}/' => 'Use of \'\\pS\' is not allowed in Unicode property wildcard subpatterns {#} m/[\\pS{#}]/',
+ '/\p{gc=:[\PS]:}/' => 'Use of \'\\PS\' is not allowed in Unicode property wildcard subpatterns {#} m/[\\PS{#}]/',
);
# These are messages that are death under 'use re "strict"', and may or may
else {
no warnings 'experimental::regex_sets';
no warnings 'experimental::re_strict';
+ no warnings 'experimental::uniprop_wildcards';
warning_is(sub {
my $meaning_of_life;
* baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
+ * a3f3caba903e4d39b6c7aaa7ea4d3a739e745b010ad51cf0e05f34ffa0ac2c04 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl