This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Set up initial \p{} parse function.
authorKarl Williamson <khw@cpan.org>
Wed, 11 Apr 2018 19:05:08 +0000 (13:05 -0600)
committerKarl Williamson <khw@cpan.org>
Fri, 20 Apr 2018 07:11:53 +0000 (01:11 -0600)
This function will parse the interior of \p{} Unicode property names in
regular expression patterns.

The design of this function will be to return NULL on the properties it
cannot handle; otherwise it returns  an inversion list representing the
property it did find.  The current mechanism will be used to handle the
cases where this function returns NULL.

This initial state is just to have the function return NULL always, so
the existing mechanism is always used.  A later commit will add
the functionality in 5.28 that bypasses the existing mechanism.

embed.fnc
embed.h
proto.h
regcomp.c
utf8.c

index 7973d65..454a380 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -768,6 +768,10 @@ AMp        |UV     |to_uni_title   |UV c|NN U8 *p|NN STRLEN *lenp
 AbDMpR |bool   |isIDFIRST_lazy |NN const char* p
 AbDMpR |bool   |isALNUM_lazy   |NN const char* p
 p      |void   |init_uniprops
+EpX    |SV *   |parse_uniprop_string|NN const char * const name           \
+                                    |const Size_t len                     \
+                                    |const bool to_fold                   \
+                                    |NN bool * invert
 #ifdef PERL_IN_UTF8_C
 snR    |U8     |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp  \
                |const char dummy
diff --git a/embed.h b/embed.h
index ab22ebc..fb93ebc 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define multiconcat_stringify(a)       Perl_multiconcat_stringify(aTHX_ a)
 #define multideref_stringify(a,b)      Perl_multideref_stringify(aTHX_ a,b)
 #define op_clear(a)            Perl_op_clear(aTHX_ a)
+#define parse_uniprop_string(a,b,c,d)  Perl_parse_uniprop_string(aTHX_ a,b,c,d)
 #define qerror(a)              Perl_qerror(aTHX_ a)
 #define reg_named_buff(a,b,c,d)        Perl_reg_named_buff(aTHX_ a,b,c,d)
 #define reg_named_buff_iter(a,b,c)     Perl_reg_named_buff_iter(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index dff4714..c91141b 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -2673,6 +2673,9 @@ PERL_CALLCONV OP* Perl_parse_termexpr(pTHX_ U32 flags);
 PERL_CALLCONV U32      Perl_parse_unicode_opts(pTHX_ const char **popt);
 #define PERL_ARGS_ASSERT_PARSE_UNICODE_OPTS    \
        assert(popt)
+PERL_CALLCONV SV *     Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const bool to_fold, bool * invert);
+#define PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING  \
+       assert(name); assert(invert)
 PERL_CALLCONV void     Perl_parser_free(pTHX_ const yy_parser *parser);
 #define PERL_ARGS_ASSERT_PARSER_FREE   \
        assert(parser)
index 11899ee..a8023db 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -16721,6 +16721,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                         * anyway, to save a little time */
                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
 
+                SvREFCNT_dec(swash); /* Free any left-overs */
                if (RExC_parse >= RExC_end)
                    vFAIL2("Empty \\%c", (U8)value);
                if (*RExC_parse == '{') {
@@ -16775,11 +16776,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                    n = 1;
                }
                if (!SIZE_ONLY) {
-                    SV* invlist;
-                    char* name;
+                    char* name = RExC_parse;
                     char* base_name;    /* name after any packages are stripped */
                     char* lookup_name = NULL;
                     const char * const colon_colon = "::";
+                    bool invert;
+
+                    SV* invlist = parse_uniprop_string(name, n, FOLD, &invert);
+                    if (invlist) {
+                        if (invert) {
+                            value ^= 'P' ^ 'p';
+                        }
+                    }
+                    else {
 
                     /* Try to get the definition of the property into
                      * <invlist>.  If /i is in effect, the effective property
@@ -16798,7 +16807,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                     /* Look up the property name, and get its swash and
                      * inversion list, if the property is found  */
-                    SvREFCNT_dec(swash); /* Free any left-overs */
                     swash = _core_swash_init("utf8",
                                              (lookup_name)
                                               ? lookup_name
@@ -16898,14 +16906,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         {
                             has_user_defined_property = TRUE;
                         }
-                        else if
+                    }
+                    }
+                    if (invlist) {
+                        if (! has_user_defined_property &&
                             /* We warn on matching an above-Unicode code point
                              * if the match would return true, except don't
                              * warn for \p{All}, which has exactly one element
                              * = 0 */
                             (_invlist_contains_cp(invlist, 0x110000)
                                 && (! (_invlist_len(invlist) == 1
-                                       && *invlist_array(invlist) == 0)))
+                                       && *invlist_array(invlist) == 0))))
                         {
                             warn_super = TRUE;
                         }
@@ -16920,14 +16931,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             /* The swash can't be used as-is, because we've
                             * inverted things; delay removing it to here after
                             * have copied its invlist above */
-                            SvREFCNT_dec_NN(swash);
+                            if (! swash) {
+                                SvREFCNT_dec_NN(invlist);
+                            }
+                            SvREFCNT_dec(swash);
                             swash = NULL;
                         }
                         else {
                             _invlist_union(properties, invlist, &properties);
+                            if (! swash) {
+                                SvREFCNT_dec_NN(invlist);
+                            }
                        }
-                   }
-               }
+                    }
+                }
                RExC_parse = e + 1;
                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
                                                 named */
diff --git a/utf8.c b/utf8.c
index 96bb929..85dba52 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -5909,6 +5909,20 @@ Perl_init_uniprops(pTHX)
     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
 }
 
+SV *
+Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const bool to_fold, bool * invert)
+{
+
+    PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
+
+    PERL_UNUSED_ARG(name);
+    PERL_UNUSED_ARG(len);
+    PERL_UNUSED_ARG(to_fold);
+    PERL_UNUSED_ARG(invert);
+
+    return NULL;
+}
+
 /*
 =for apidoc utf8_to_uvchr