From 1c2f3d7a101e8233a39695492df7bc03c8e2b3bc Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 8 Dec 2019 12:16:29 -0700 Subject: [PATCH] PATCH GH #17025 \p{user-defined} overrides official Unicode Prior to this patch, they only sometimes overrode. --- charclass_invlists.h | 2 +- lib/unicore/mktables | 3 ++ lib/unicore/uni_keywords.pl | 2 +- pod/perldelta.pod | 14 ++++++++ pod/perlunicode.pod | 4 ++- regcharclass.h | 2 +- regcomp.c | 86 ++++++++++++++++++++++++++++++++++++++++----- t/re/regexp_unicode_prop.t | 10 ++---- uni_keywords.h | 2 +- 9 files changed, 104 insertions(+), 21 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 87cd593..e81fae5 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -395174,7 +395174,7 @@ static const U8 WB_table[23][23] = { * 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt * 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt * 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt - * 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables + * 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables * a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 6453656..6be1f41 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -17075,6 +17075,9 @@ other two shortcuts, and Unicode continues to define new properties that begin with C<"In">, so it's quite possible that a conflict will occur in the future. The compound form is guaranteed to not become obsolete, and its meaning is clearer anyway. 
See L for more information about this. + +User-defined properties must begin with "In" or "Is". These override any +Unicode property of the same name. END } my $text = $Is_flags_text; diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index afa9f64..a665a9c 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1260,7 +1260,7 @@ # 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt # 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt # 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt -# 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables +# 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables # a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl # e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 7ce4cf8..23d3fe7 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -45,6 +45,20 @@ XXX For a release on a stable branch, this section aspires to be: [ List each incompatible change as a =head2 entry ] +=head2 C<\p{I<user-defined>}> properties now always override official +Unicode ones + +Previously, if and only if a user-defined property was declared prior to +the compilation of the regular expression pattern containing it, its +definition was used instead of any official Unicode property with the +same name. Now, it always overrides the official property. This +change could break existing code that relied (likely unwittingly) on the +previous behavior. 
Without this fix, if Unicode released a new version +with a new property that happens to have the same name as the one you +had long been using, your program would break when you upgraded to a +perl that used that new Unicode version. See L<perlunicode/User-Defined Character Properties>. [GH #17205] + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index e048df6..5905822 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -1066,7 +1066,9 @@ You can define your own binary character properties by defining subroutines whose names begin with C<"In"> or C<"Is">. (The experimental feature L provides an alternative which allows more complex definitions.) The subroutines can be defined in any -package. The user-defined properties can be used in the regular expression +package. They override any Unicode properties with the same +names. The user-defined properties can be used in the regular +expression C<\p{}> and C<\P{}> constructs; if you are using a user-defined property from a package other than the one you are in, you must specify its package in the C<\p{}> or C<\P{}> construct. 
diff --git a/regcharclass.h b/regcharclass.h index cb25924..1bd317c 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -2245,7 +2245,7 @@ * 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt * 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt * 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt - * 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables + * 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables * a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl diff --git a/regcomp.c b/regcomp.c index 3202323..028fd06 100644 --- a/regcomp.c +++ b/regcomp.c @@ -426,6 +426,14 @@ struct RExC_state_t { #define _invlist_intersection_complement_2nd(a, b, output) \ _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) +/* We add a marker if we are deferring expansion of a potential user-defined + * property until it is needed at runtime the first time it is encountered in a + * pattern match. This marker that shouldn't conflict with any that could be + * in a legal name is appended to its name to indicate this. There is a string + * and character form */ +#define DEFERRED_PROP_EXPANSION_MARKERs "~" +#define DEFERRED_PROP_EXPANSION_MARKERc '~' + /* About scan_data_t. During optimisation we recurse through the regexp program performing @@ -19845,11 +19853,13 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, continue; } - /* Here, didn't find a legal hex number. Just add it from - * here to the next \n */ + /* Here, didn't find a legal hex number. 
Just add the text + * from here up to the next \n, omitting any trailing + * markers. */ remaining -= len; - len = strcspn(si_string, "\n"); + len = strcspn(si_string, + DEFERRED_PROP_EXPANSION_MARKERs "\n"); remaining -= len; if (matches_string) { sv_catpvn(matches_string, si_string, len); @@ -19860,6 +19870,13 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, sv_catpvs(matches_string, " "); si_string += len; + if ( remaining + && UCHARAT(si_string) + == DEFERRED_PROP_EXPANSION_MARKERc) + { + si_string++; + remaining--; + } if (remaining && UCHARAT(si_string) == '\n') { si_string++; remaining--; @@ -23099,7 +23116,7 @@ Perl_parse_uniprop_string(pTHX_ * Other parameters will be set on return as described below */ const char * const name, /* The first non-blank in the \p{}, \P{} */ - const Size_t name_len, /* Its length in bytes, not including any + Size_t name_len, /* Its length in bytes, not including any trailing space */ const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ const bool to_fold, /* ? Is this under /i */ @@ -23147,6 +23164,9 @@ Perl_parse_uniprop_string(pTHX_ qualified name */ bool invert_return = FALSE; /* ? Do we need to complement the result before returning it */ + bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an + explicit utf8:: package that we strip + off */ PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING; @@ -23205,6 +23225,17 @@ Perl_parse_uniprop_string(pTHX_ break; } + /* If this looks like it is a marker we inserted at compile time, + * ignore it; otherwise keep it as it would have been user input. */ + if ( UNLIKELY(cur == DEFERRED_PROP_EXPANSION_MARKERc) + && ! deferrable + && could_be_user_defined + && i == name_len - 1) + { + name_len--; + continue; + } + /* Otherwise, this character is part of the name. 
*/ lookup_name[j++] = cur; @@ -23238,6 +23269,7 @@ Perl_parse_uniprop_string(pTHX_ lookup_name += STRLENs("utf8::"); j -= STRLENs("utf8::"); equals_pos -= STRLENs("utf8::"); + stripped_utf8_pkg = TRUE; } /* Here, we are either done with the whole property name, if it was simple; @@ -23634,7 +23666,29 @@ Perl_parse_uniprop_string(pTHX_ /* Here, the name could be for a user defined property, which are * implemented as subs. */ user_sub = get_cvn_flags(name, name_len, 0); - if (user_sub) { + if (! user_sub) { + + /* Here, the property name could be a user-defined one, but there + * is no subroutine to handle it (as of now). Defer handling it + * until runtime. Otherwise, a block defined by Unicode in a later + * release would get the synonym InFoo added for it, and existing + * code that used that name would suddenly break if it referred to + * the property before the sub was declared. See [perl #134146] */ + if (deferrable) { + goto definition_deferred; + } + + /* If we haven't already stripped the package name (if one), do so + * now so can look for an official property with the stripped name. + * */ + if (! stripped_utf8_pkg) { + lookup_name += non_pkg_begin; + j -= non_pkg_begin; + } + + /* Drop down to look up in the official properties */ + } + else { const char insecure[] = "Insecure user-defined property"; /* Here, there is a sub by the correct name. Normally we call it @@ -24270,18 +24324,34 @@ Perl_parse_uniprop_string(pTHX_ definition_deferred: + { + bool is_qualified = non_pkg_begin != 0; /* If has "::" */ + /* Here it could yet to be defined, so defer evaluation of this * until its needed at runtime. We need the fully qualified property name - * to avoid ambiguity, and a trailing newline */ + * to avoid ambiguity */ if (! 
fq_name) { fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, - non_pkg_begin != 0 /* If has "::" */ - ); + is_qualified); } + + /* If it didn't come with a package, or the package is utf8::, this + * actually could be an official Unicode property whose inclusion we + * are deferring until runtime to make sure that it isn't overridden by + * a user-defined property of the same name (which we haven't + * encountered yet). Add a marker to indicate this possibility, for + * use at such time when we first need the definition during pattern + * matching execution */ + if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) { + sv_catpvs(fq_name, DEFERRED_PROP_EXPANSION_MARKERs); + } + + /* We also need a trailing newline */ sv_catpvs(fq_name, "\n"); *user_defined_ptr = TRUE; return fq_name; + } } #endif diff --git a/t/re/regexp_unicode_prop.t b/t/re/regexp_unicode_prop.t index 6df2968..5c5a1d7 100644 --- a/t/re/regexp_unicode_prop.t +++ b/t/re/regexp_unicode_prop.t @@ -143,6 +143,7 @@ BEGIN { Dash => ['-'], ASCII_Hex_Digit => ['!-', 'A'], IsAsciiHexAndDash => ['-', 'A'], + InLatin1 => ['\x{0100}', '!\x{00FF}'], ); @USER_CASELESS_PROPERTIES = ( @@ -194,12 +195,6 @@ BEGIN { } } -# These override the official ones, so if found before defined, the official -# ones prevail, so can't test deferred definition -my @OVERRIDING_USER_DEFINED_PROPERTIES = ( - InLatin1 => ['\x{0100}', '!\x{00FF}'], -); - # # From the short properties we populate POSIX-like classes. 
# @@ -249,8 +244,7 @@ while (my ($class, $chars) = each %SHORT_PROPERTIES) { push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, "# POSIX like properties" => %d, - "# User defined properties" => @USER_DEFINED_PROPERTIES, - "# Overriding user defined properties" => @OVERRIDING_USER_DEFINED_PROPERTIES; + "# User defined properties" => @USER_DEFINED_PROPERTIES; # diff --git a/uni_keywords.h b/uni_keywords.h index c3bf4bf..5e0e630 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7283,7 +7283,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt * 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt * 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt - * 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables + * 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables * a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl -- 1.8.3.1