This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Replace code with fcn that does the same thing
[perl5.git] / regcomp.c
index 9319e60..ca0c7af 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -657,7 +657,13 @@ static const scan_data_t zero_scan_data = {
     UTF8fARG(UTF,                                                           \
              (xI(xC) > eC) /* Don't run off end */                          \
               ? eC - sC   /* Length before the <--HERE */                   \
-              : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ),            \
+              : ((xI_offset(xC) >= 0)                                       \
+                 ? xI_offset(xC)                                            \
+                 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
+                                    IVdf " trying to output message for "   \
+                                    " pattern %.*s",                        \
+                                    __FILE__, __LINE__, xI_offset(xC),      \
+                                    ((int) (eC - sC)), sC), 0)),            \
              sC),         /* The input pattern printed up to the <--HERE */ \
     UTF8fARG(UTF,                                                           \
              (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
@@ -5476,7 +5482,9 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                 /* Cannot expect anything... */
                 scan_commit(pRExC_state, data, minlenp, is_inf);
                data->pos_min += 1;
-               data->pos_delta += 1;
+                if (data->pos_delta != SSize_t_MAX) {
+                    data->pos_delta += 1;
+                }
                data->cur_is_floating = 1; /* float */
            }
        }
@@ -5608,7 +5616,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                     invert = 1;
                     /* FALLTHROUGH */
                case ASCII:
-                    my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
+                    my_invlist = invlist_clone(PL_Posix_ptrs[_CC_ASCII]);
 
                     /* This can be handled as a Posix class */
                     goto join_posix_and_ascii;
@@ -5619,9 +5627,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                     /* FALLTHROUGH */
                case POSIXA:
                     assert(FLAGS(scan) != _CC_ASCII);
-                    _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
-                                          PL_XPosix_ptrs[_CC_ASCII],
-                                          &my_invlist);
+                    my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)]);
                     goto join_posix_and_ascii;
 
                case NPOSIXD:
@@ -8595,7 +8601,7 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
     PERL_UNUSED_CONTEXT;
     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     SvCUR_set(invlist,
               (len == 0)
@@ -8623,8 +8629,8 @@ S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
 
     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
 
-    assert(SvTYPE(src) == SVt_INVLIST);
-    assert(SvTYPE(dest) == SVt_INVLIST);
+    assert(is_invlist(src));
+    assert(is_invlist(dest));
     assert(! invlist_is_iterating(src));
     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
 
@@ -8659,7 +8665,7 @@ S_get_invlist_previous_index_addr(SV* invlist)
      * */
     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     return &(((XINVLIST*) SvANY(invlist))->prev_index);
 }
@@ -8697,7 +8703,7 @@ S_invlist_trim(SV* invlist)
 
     PERL_ARGS_ASSERT_INVLIST_TRIM;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
 }
@@ -8707,7 +8713,7 @@ S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
 {
     PERL_ARGS_ASSERT_INVLIST_CLEAR;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     invlist_set_len(invlist, 0, 0);
     invlist_trim(invlist);
@@ -8733,7 +8739,7 @@ S_invlist_max(SV* const invlist)
 
     PERL_ARGS_ASSERT_INVLIST_MAX;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     /* Assumes worst case, in which the 0 element is not counted in the
      * inversion list, so subtracts 1 for that */
@@ -8764,7 +8770,7 @@ Perl__new_invlist(pTHX_ IV initial_size)
     invlist_set_len(new_list, 0, 0);
 
     /* Force iterinit() to be used to get iteration to work */
-    *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
+    invlist_iterfinish(new_list);
 
     *get_invlist_previous_index_addr(new_list) = 0;
 
@@ -8834,7 +8840,7 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
 
     PERL_ARGS_ASSERT_INVLIST_EXTEND;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     /* Add one to account for the zero element at the beginning which may not
      * be counted by the calling parameters */
@@ -9158,7 +9164,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 
     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
     assert(a != b);
-    assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST);
+    assert(*output == NULL || is_invlist(*output));
 
     len_b = _invlist_len(b);
     if (len_b == 0) {
@@ -9436,7 +9442,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
 
     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
     assert(a != b);
-    assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST);
+    assert(*i == NULL || is_invlist(*i));
 
     /* Special case if either one is empty */
     len_a = (a == NULL) ? 0 : _invlist_len(a);
@@ -9979,7 +9985,7 @@ S_get_invlist_iter_addr(SV* invlist)
 
     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
 
-    assert(SvTYPE(invlist) == SVt_INVLIST);
+    assert(is_invlist(invlist));
 
     return &(((XINVLIST*) SvANY(invlist))->iterator);
 }
@@ -10175,10 +10181,6 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
     }
 }
 
-void
-Perl__load_PL_utf8_foldclosures (pTHX)
-{
-}
 #endif
 
 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
@@ -10288,7 +10290,6 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
     }
     else {  /* Pattern is UTF-8 */
         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
-        STRLEN foldlen = UTF8SKIP(s);
         const U8* e = s + bytelen;
         IV fc;
 
@@ -10327,7 +10328,6 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
             /* And set up so the code below that looks in this folded
              * buffer instead of the node's string */
             e = d;
-            foldlen = UTF8SKIP(folded);
             s = folded;
         }
 
@@ -10348,8 +10348,8 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
         }
         else {  /* Single char fold */
             unsigned int k;
-            int first_folds_to;
-            const int * remaining_folds_to_list;
+            unsigned int first_folds_to;
+            const unsigned int * remaining_folds_to_list;
             Size_t folds_to_count;
 
             /* It matches itself */
@@ -10362,14 +10362,14 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
             for (k = 0; k < folds_to_count; k++) {
                 UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
 
-                    /* /aa doesn't allow folds between ASCII and non- */
-                    if ((OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
-                        && isASCII(c) != isASCII(fc))
-                    {
-                        continue;
-                    }
+                /* /aa doesn't allow folds between ASCII and non- */
+                if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
+                    && isASCII(c) != isASCII(fc))
+                {
+                    continue;
+                }
 
-                    invlist = add_cp_to_invlist(invlist, c);
+                invlist = add_cp_to_invlist(invlist, c);
             }
         }
     }
@@ -11245,6 +11245,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         RExC_parse++;
                         is_neg = TRUE;
                     }
+                    endptr = RExC_end;
                     if (grok_atoUV(RExC_parse, &unum, &endptr)
                         && unum <= I32_MAX
                     ) {
@@ -11483,6 +11484,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     }
                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                         UV uv;
+                        endptr = RExC_end;
                         if (grok_atoUV(RExC_parse, &uv, &endptr)
                             && uv <= I32_MAX
                         ) {
@@ -11518,6 +11520,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     /* (?(1)...) */
                    char c;
                     UV uv;
+                    endptr = RExC_end;
                     if (grok_atoUV(RExC_parse, &uv, &endptr)
                         && uv <= I32_MAX
                     ) {
@@ -12027,6 +12030,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                maxpos = next;
            RExC_parse++;
             if (isDIGIT(*RExC_parse)) {
+                endptr = RExC_end;
                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
                     vFAIL("Invalid quantifier in {,}");
                 if (uv >= REG_INFTY)
@@ -12040,6 +12044,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            else
                maxpos = RExC_parse;
             if (isDIGIT(*maxpos)) {
+                endptr = RExC_end;
                 if (!grok_atoUV(maxpos, &uv, &endptr))
                     vFAIL("Invalid quantifier in {,}");
                 if (uv >= REG_INFTY)
@@ -12240,8 +12245,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
   * *node_p, nor *code_point_p, nor *flagp.
   *
   * If <cp_count> is not NULL, the caller wants to know the length (in code
-  * points) that this \N sequence matches.  This is set even if the function
-  * returns FALSE, as detailed below.
+  * points) that this \N sequence matches.  This is set, and the input is
+  * parsed for errors, even if the function returns FALSE, as detailed below.
   *
   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
   *
@@ -12405,8 +12410,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
             /* Loop through the hex digits of the current code point */
             do {
                 /* Adding this digit will shift the result 4 bits.  If that
-                 * result would be above IV_MAX, it's overflow */
-                if (cp > IV_MAX >> 4) {
+                 * result would be above the legal max, it's overflow */
+                if (cp > MAX_LEGAL_CP >> 4) {
 
                     /* Find the end of the code point */
                     do {
@@ -12417,7 +12422,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
                      * in utf8.c */
                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
                         " permissible max is 0x%" UVxf,
-                        (int) (RExC_parse - start_digit), start_digit, IV_MAX);
+                        (int) (RExC_parse - start_digit), start_digit,
+                        MAX_LEGAL_CP);
                 }
 
                 /* Accumulate this (valid) digit into the running total */
@@ -12464,31 +12470,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
             }
 
             /* Here, looks like its really a multiple character sequence.  Fail
-             * if that's not what the caller wants. */
-            if (! node_p) {
-
-                /* But even if failing, we count the code points if requested, and
-                 * don't back up up the pointer as the caller is expected to
-                 * handle this situation */
-                if (cp_count) {
-                    char * dot = RExC_parse + 1;
-                    do {
-                        dot = (char *) memchr(dot, '.', endbrace - dot);
-                        if (! dot) {
-                            break;
-                        }
-                        count++;
-                        dot++;
-                    } while (dot < endbrace);
-                    count++;
-
-                    *cp_count = count;
-                    RExC_parse = endbrace;
-                    nextchar(pRExC_state);
-                }
-                else {  /* Back up the pointer. */
-                    RExC_parse = p;
-                }
+             * if that's not what the caller wants.  But continue with counting
+             * and error checking if they still want a count */
+            if (! node_p && ! cp_count) {
                 return FALSE;
             }
 
@@ -12496,26 +12480,39 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
              * form \x{char1}\x{char2}...  and then call reg recursively to
              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
              * atomicness, while not having to worry about special handling
-             * that some code points may have. */
+             * that some code points may have.  We don't create a subpattern,
+             * but go through the motions of code point counting and error
+             * checking, if the caller doesn't want a node returned. */
 
-            if (count == 1) {
+            if (node_p && count == 1) {
                 substitute_parse = newSVpvs("?:");
             }
 
           do_concat:
 
-            /* Convert to notation the rest of the code understands */
-            sv_catpv(substitute_parse, "\\x{");
-            sv_catpvn(substitute_parse, start_digit, RExC_parse - start_digit);
-            sv_catpv(substitute_parse, "}");
+            if (node_p) {
+                /* Convert to notation the rest of the code understands */
+                sv_catpvs(substitute_parse, "\\x{");
+                sv_catpvn(substitute_parse, start_digit,
+                                            RExC_parse - start_digit);
+                sv_catpvs(substitute_parse, "}");
+            }
 
             /* Move to after the dot (or ending brace the final time through.)
              * */
             RExC_parse++;
+            count++;
 
         } while (RExC_parse < endbrace);
 
-        sv_catpv(substitute_parse, ")");
+        if (! node_p) { /* Doesn't want the node */
+            assert (cp_count);
+
+            *cp_count = count;
+            return FALSE;
+        }
+
+        sv_catpvs(substitute_parse, ")");
 
 #ifdef EBCDIC
         /* The values are Unicode, and therefore have to be converted to native
@@ -12806,9 +12803,9 @@ S_new_regcurly(const char *s, const char *e)
  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
 
 static I32
-S_backref_value(char *p)
+S_backref_value(char *p, char *e)
 {
-    const char* endptr;
+    const char* endptr = e;
     UV val;
     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
         return (I32)val;
@@ -13354,7 +13351,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     if (RExC_parse >= RExC_end) {
                         goto unterminated_g;
                     }
-                    num = S_backref_value(RExC_parse);
+                    num = S_backref_value(RExC_parse, RExC_end);
                     if (num == 0)
                         vFAIL("Reference to invalid group 0");
                     else if (num == I32_MAX) {
@@ -13372,7 +13369,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     }
                 }
                 else {
-                    num = S_backref_value(RExC_parse);
+                    num = S_backref_value(RExC_parse, RExC_end);
                     /* bare \NNN might be backref or octal - if it is larger
                      * than or equal RExC_npar then it is assumed to be an
                      * octal escape. Note RExC_npar is +1 from the actual
@@ -13511,6 +13508,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * need to figure this out until pass 2) */
             bool maybe_exactfu = PASS2;
 
+            /* To see if RExC_uni_semantics changes during parsing of the node.
+             * */
+            bool uni_semantics_at_node_start;
+
             /* The node_type may change below, but since the size of the node
              * doesn't change, it works */
            ret = reg_node(pRExC_state, node_type);
@@ -13537,6 +13538,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                    || UTF8_IS_START(UCHARAT(RExC_parse)));
 
+            uni_semantics_at_node_start = RExC_uni_semantics;
+
             /* Here, we have a literal character.  Find the maximal string of
              * them in the input that we can fit into a single EXACTish node.
              * We quit at the first non-literal or when the node gets full, or
@@ -13743,7 +13746,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* NOTE, RExC_npar is 1 more than the actual number of
                          * parens we have seen so far, hence the < RExC_npar below. */
 
-                        if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
+                        if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
                         {  /* Not to be treated as an octal constant, go
                                    find backref */
                             --p;
@@ -13785,20 +13788,22 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    } /* End of switch on '\' */
                    break;
                case '{':
-                    /* Currently we allow an lbrace at the start of a construct
-                     * without raising a warning.  This is because we think we
-                     * will never want such a brace to be meant to be other
-                     * than taken literally. */
+                    /* Trying to gain new uses for '{' without breaking too
+                     * much existing code is hard.  The solution currently
+                     * adopted is:
+                     *  1)  If there is no ambiguity that a '{' should always
+                     *      be taken literally, at the start of a construct, we
+                     *      just do so.
+                     *  2)  If the literal '{' conflicts with our desired use
+                     *      of it as a metacharacter, we die.  The deprecation
+                     *      cycles for this have come and gone.
+                     *  3)  If there is ambiguity, we raise a simple warning.
+                     *      This could happen, for example, if the user
+                     *      intended it to introduce a quantifier, but slightly
+                     *      misspelled the quantifier.  Without this warning,
+                     *      the quantifier would silently be taken as a literal
+                     *      string of characters instead of a meta construct */
                    if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
-
-                        /* But, we raise a fatal warning otherwise, as the
-                         * deprecation cycle has come and gone.  Except that it
-                         * turns out that some heavily-relied on upstream
-                         * software, notably GNU Autoconf, have failed to fix
-                         * their uses.  For these, don't make it fatal unless
-                         * we anticipate using the '{' for something else.
-                         * This happens after any alpha, and for a looser {m,n}
-                         * quantifier specification */
                         if (      RExC_strict
                             || (  p > parse_start + 1
                                 && isALPHA_A(*(p - 1))
@@ -13810,10 +13815,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                   "illegal here");
                         }
                         if (PASS2) {
-                            ckWARNregdep(p + 1,
-                                        "Unescaped left brace in regex is "
-                                        "deprecated here (and will be fatal "
-                                        "in Perl 5.30), passed through");
+                            ckWARNreg(p + 1, "Unescaped left brace in regex is"
+                                             " passed through");
                         }
                    }
                    goto normal_default;
@@ -13974,6 +13977,25 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                 ender = 's';
                                 added_len = 2;
                             }
+                            else if (   uni_semantics_at_node_start
+                                     != RExC_uni_semantics)
+                            {
+                                /* Here, we are supossed to be using Unicode
+                                 * rules, but this folding node is not.  This
+                                 * happens during pass 1 when the node started
+                                 * out not under Unicode rules, but a \N{} was
+                                 * encountered during the processing of it,
+                                 * causing Unicode rules to be switched into.
+                                 * Pass 1 continues uninterrupted, as by the
+                                 * time we get to pass 2, we will know enough
+                                 * to generate the correct folds.  Except in
+                                 * this one case, we need to restart the node,
+                                 * because the fold of the sharp s requires 2
+                                 * characters, and the sizing needs to account
+                                 * for that. */
+                                p = oldp;
+                                goto loopdone;
+                            }
                             else {
                                 RExC_seen_unfolded_sharp_s = 1;
                                 maybe_exactfu = FALSE;
@@ -14303,8 +14325,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     /* Position parse to next real character */
     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
                                             FALSE /* Don't force to /x */ );
-    if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
-        ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through");
+    if (   PASS2 && *RExC_parse == '{'
+        && OP(ret) != SBOL && ! regcurly(RExC_parse))
+    {
+        if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
+            RExC_parse++;
+            vFAIL("Unescaped left brace in regex is illegal here");
+        }
+        ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
+                                  " passed through");
     }
 
     return(ret);
@@ -15695,7 +15724,7 @@ redo_curchar:
                  * fence.  Get rid of it */
                 fence_ptr = av_pop(fence_stack);
                 assert(fence_ptr);
-                fence = SvIV(fence_ptr) - 1;
+                fence = SvIV(fence_ptr);
                 SvREFCNT_dec_NN(fence_ptr);
                 fence_ptr = NULL;
 
@@ -15929,7 +15958,7 @@ redo_curchar:
     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
         || ((final = av_pop(stack)) == NULL)
         || ! IS_OPERAND(final)
-        || SvTYPE(final) != SVt_INVLIST
+        || ! is_invlist(final)
         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
     {
       bad_syntax:
@@ -16086,25 +16115,19 @@ S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
 STATIC void
 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
 {
-    /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
-     * innocent-looking character class, like /[ks]/i won't have to go out to
-     * disk to find the possible matches.
+    /* This adds the Latin1/above-Latin1 folding rules.
      *
      * This should be called only for a Latin1-range code points, cp, which is
      * known to be involved in a simple fold with other code points above
      * Latin1.  It would give false results if /aa has been specified.
      * Multi-char folds are outside the scope of this, and must be handled
-     * specially.
-     *
-     * XXX It would be better to generate these via regen, in case a new
-     * version of the Unicode standard adds new mappings, though that is not
-     * really likely, and may be caught by the default: case of the switch
-     * below. */
+     * specially. */
 
     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
 
     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
 
+    /* The rules that are valid for all Unicode versions are hard-coded in */
     switch (cp) {
         case 'k':
         case 'K':
@@ -16128,36 +16151,54 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
             break;
 
-#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+        default:    /* Other code points are checked against the data for the
+                       current Unicode version */
+          {
+            Size_t folds_to_count;
+            unsigned int first_folds_to;
+            const unsigned int * remaining_folds_to_list;
+            UV folded_cp;
 
-        case LATIN_SMALL_LETTER_SHARP_S:
-          *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
-            break;
+            if (isASCII(cp)) {
+                folded_cp = toFOLD(cp);
+            }
+            else {
+                U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
+                Size_t dummy_len;
+                folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
+            }
 
-#endif
+            if (folded_cp > 255) {
+                *invlist = add_cp_to_invlist(*invlist, folded_cp);
+            }
 
-#if    UNICODE_MAJOR_VERSION < 3                                        \
-   || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
+            folds_to_count = _inverse_folds(folded_cp, &first_folds_to,
+                                                    &remaining_folds_to_list);
+            if (folds_to_count == 0) {
 
-        /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
-         * U+0131.  */
-        case 'i':
-        case 'I':
-          *invlist =
-             add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
-#   if UNICODE_DOT_DOT_VERSION == 1
-          *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
-#   endif
-            break;
-#endif
+                /* Use deprecated warning to increase the chances of this being
+                 * output */
+                if (PASS2) {
+                    ckWARN2reg_d(RExC_parse,
+                        "Perl folding rules are not up-to-date for 0x%02X;"
+                        " please use the perlbug utility to report;", cp);
+                }
+            }
+            else {
+                unsigned int i;
 
-        default:
-            /* Use deprecated warning to increase the chances of this being
-             * output */
-            if (PASS2) {
-                ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+                if (first_folds_to > 255) {
+                    *invlist = add_cp_to_invlist(*invlist, first_folds_to);
+                }
+                for (i = 0; i < folds_to_count - 1; i++) {
+                    if (remaining_folds_to_list[i] > 255) {
+                        *invlist = add_cp_to_invlist(*invlist,
+                                                    remaining_folds_to_list[i]);
+                    }
+                }
             }
             break;
+         }
     }
 }
 
@@ -16684,6 +16725,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
            case 'P':
                {
                char *e;
+                char *i;
+
 
                 /* We will handle any undefined properties ourselves */
                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
@@ -16693,6 +16736,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                         * anyway, to save a little time */
                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
 
+                SvREFCNT_dec(swash); /* Free any left-overs */
                if (RExC_parse >= RExC_end)
                    vFAIL2("Empty \\%c", (U8)value);
                if (*RExC_parse == '{') {
@@ -16704,6 +16748,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     }
 
                     RExC_parse++;
+
+                    /* White space is allowed adjacent to the braces and after
+                     * any '^', even when not under /x */
                     while (isSPACE(*RExC_parse)) {
                          RExC_parse++;
                    }
@@ -16727,6 +16774,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                    n = e - RExC_parse;
                    while (isSPACE(*(RExC_parse + n - 1)))
                        n--;
+
                }   /* The \p isn't immediately followed by a '{' */
                else if (! isALPHA(*RExC_parse)) {
                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
@@ -16739,11 +16787,34 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                    n = 1;
                }
                if (!SIZE_ONLY) {
-                    SV* invlist;
-                    char* name;
+                    char* name = RExC_parse;
                     char* base_name;    /* name after any packages are stripped */
                     char* lookup_name = NULL;
                     const char * const colon_colon = "::";
+                    bool invert;
+
+                    SV* invlist;
+
+                    /* Temporary workaround for [perl #133136].  For this
+                    * precise input that is in the .t that is failing, load
+                    * utf8.pm, which is what the test wants, so that that
+                    * .t passes */
+                    if (     memEQs(RExC_start, e + 1 - RExC_start,
+                                    "foo\\p{Alnum}")
+                        && ! hv_common(GvHVn(PL_incgv),
+                                       NULL,
+                                       "utf8.pm", sizeof("utf8.pm") - 1,
+                                       0, HV_FETCH_ISEXISTS, NULL, 0))
+                    {
+                        require_pv("utf8.pm");
+                    }
+                    invlist = parse_uniprop_string(name, n, FOLD, &invert);
+                    if (invlist) {
+                        if (invert) {
+                            value ^= 'P' ^ 'p';
+                        }
+                    }
+                    else {
 
                     /* Try to get the definition of the property into
                      * <invlist>.  If /i is in effect, the effective property
@@ -16752,6 +16823,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
                     SAVEFREEPV(name);
+
+                    for (i = RExC_parse; i < RExC_parse + n; i++) {
+                        if (isCNTRL(*i) && *i != '\t') {
+                            RExC_parse = e + 1;
+                            vFAIL2("Can't find Unicode property definition \"%s\"", name);
+                        }
+                    }
+
                     if (FOLD) {
                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
 
@@ -16762,7 +16841,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                     /* Look up the property name, and get its swash and
                      * inversion list, if the property is found  */
-                    SvREFCNT_dec(swash); /* Free any left-overs */
                     swash = _core_swash_init("utf8",
                                              (lookup_name)
                                               ? lookup_name
@@ -16862,19 +16940,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         {
                             has_user_defined_property = TRUE;
                         }
-                        else if
+                    }
+                    }
+                    if (invlist) {
+                        if (! has_user_defined_property &&
                             /* We warn on matching an above-Unicode code point
                              * if the match would return true, except don't
                              * warn for \p{All}, which has exactly one element
                              * = 0 */
                             (_invlist_contains_cp(invlist, 0x110000)
                                 && (! (_invlist_len(invlist) == 1
-                                       && *invlist_array(invlist) == 0)))
+                                       && *invlist_array(invlist) == 0))))
                         {
                             warn_super = TRUE;
                         }
 
-
                         /* Invert if asking for the complement */
                         if (value == 'P') {
                            _invlist_union_complement_2nd(properties,
@@ -16884,14 +16964,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             /* The swash can't be used as-is, because we've
                             * inverted things; delay removing it to here after
                             * have copied its invlist above */
-                            SvREFCNT_dec_NN(swash);
+                            if (! swash) {
+                                SvREFCNT_dec_NN(invlist);
+                            }
+                            SvREFCNT_dec(swash);
                             swash = NULL;
                         }
                         else {
                             _invlist_union(properties, invlist, &properties);
+                            if (! swash) {
+                                SvREFCNT_dec_NN(invlist);
+                            }
                        }
-                   }
-               }
+                    }
+                } /* End of actually getting the values in pass 2 */
+
                RExC_parse = e + 1;
                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
                                                 named */
@@ -17143,21 +17230,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     }
                 }
                 else if (  UNI_SEMANTICS
+                        || AT_LEAST_ASCII_RESTRICTED
                         || classnum == _CC_ASCII
                         || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
                                                   || classnum == _CC_XDIGIT)))
                 {
-                    /* We usually have to worry about /d and /a affecting what
-                     * POSIX classes match, with special code needed for /d
-                     * because we won't know until runtime what all matches.
-                     * But there is no extra work needed under /u, and
-                     * [:ascii:] is unaffected by /a and /d; and :digit: and
-                     * :xdigit: don't have runtime differences under /d.  So we
-                     * can special case these, and avoid some extra work below,
-                     * and at runtime. */
+                    /* We usually have to worry about /d a affecting what POSIX
+                     * classes match, with special code needed because we won't
+                     * know until runtime what all matches.  But there is no
+                     * extra work needed under /u and /a; and [:ascii:] is
+                     * unaffected by /d; and :digit: and :xdigit: don't have
+                     * runtime differences under /d.  So we can special case
+                     * these, and avoid some extra work below, and at runtime.
+                     * */
                     _invlist_union_maybe_complement_2nd(
                                                      simple_posixes,
-                                                     PL_XPosix_ptrs[classnum],
+                                                      ((AT_LEAST_ASCII_RESTRICTED)
+                                                       ? PL_Posix_ptrs[classnum]
+                                                       : PL_XPosix_ptrs[classnum]),
                                                      namedclass % 2 != 0,
                                                      &simple_posixes);
                 }
@@ -17536,7 +17626,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
            because too confusing */
         if (invert) {
-            sv_catpv(substitute_parse, "(?:");
+            sv_catpvs(substitute_parse, "(?:");
         }
 #endif
 
@@ -17556,7 +17646,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                                                 &PL_sv_undef)
                 {
                     if (! first_time) {
-                        sv_catpv(substitute_parse, "|");
+                        sv_catpvs(substitute_parse, "|");
                     }
                     first_time = FALSE;
 
@@ -17568,24 +17658,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         /* If the character class contains anything else besides these
          * multi-character folds, have to include it in recursive parsing */
         if (element_count) {
-            sv_catpv(substitute_parse, "|[");
+            sv_catpvs(substitute_parse, "|[");
             prefix_end = SvCUR(substitute_parse);
             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
 
             /* Put in a closing ']' only if not going off the end, as otherwise
              * we are adding something that really isn't there */
             if (RExC_parse < RExC_end) {
-                sv_catpv(substitute_parse, "]");
+                sv_catpvs(substitute_parse, "]");
             }
         }
 
-        sv_catpv(substitute_parse, ")");
+        sv_catpvs(substitute_parse, ")");
 #if 0
         if (invert) {
             /* This is a way to get the parse to skip forward a whole named
              * sequence instead of matching the 2nd character when it fails the
              * first */
-            sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
+            sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
         }
 #endif
 
@@ -17872,8 +17962,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     STRLEN foldlen;
                     unsigned int k;
                     Size_t folds_to_count;
-                    int first_folds_to;
-                    const int * remaining_folds_to_list;
+                    unsigned int first_folds_to;
+                    const unsigned int * remaining_folds_to_list;
 
                     if (j < 256) {
 
@@ -17927,33 +18017,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                      /* Then the remaining ones */
                                    : remaining_folds_to_list[k-2];
 
-                            /* /aa doesn't allow folds between ASCII and non- */
-                            if ((ASCII_FOLD_RESTRICTED
-                                && (isASCII(c) != isASCII(j))))
-                            {
-                                continue;
-                            }
+                        /* /aa doesn't allow folds between ASCII and non- */
+                        if ((   ASCII_FOLD_RESTRICTED
+                            && (isASCII(c) != isASCII(j))))
+                        {
+                            continue;
+                        }
 
-                            /* Folds under /l which cross the 255/256 boundary
-                             * are added to a separate list.  (These are valid
-                             * only when the locale is UTF-8.) */
-                            if (c < 256 && LOC) {
-                                *use_list = add_cp_to_invlist(*use_list, c);
-                                continue;
-                            }
+                        /* Folds under /l which cross the 255/256 boundary are
+                         * added to a separate list.  (These are valid only
+                         * when the locale is UTF-8.) */
+                        if (c < 256 && LOC) {
+                            *use_list = add_cp_to_invlist(*use_list, c);
+                            continue;
+                        }
 
-                            if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
-                            {
-                                cp_list = add_cp_to_invlist(cp_list, c);
-                            }
-                            else {
-                                /* Similarly folds involving non-ascii Latin1
-                                * characters under /d are added to their list */
-                                has_upper_latin1_only_utf8_matches
-                                        = add_cp_to_invlist(
-                                           has_upper_latin1_only_utf8_matches,
-                                           c);
-                            }
+                        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+                        {
+                            cp_list = add_cp_to_invlist(cp_list, c);
+                        }
+                        else {
+                            /* Similarly folds involving non-ascii Latin1
+                             * characters under /d are added to their list */
+                            has_upper_latin1_only_utf8_matches
+                                = add_cp_to_invlist(
+                                            has_upper_latin1_only_utf8_matches,
+                                            c);
+                        }
                     }
                 }
             }
@@ -17981,26 +18071,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
     }
     if (posixes || nposixes) {
-
-        /* We have to adjust /a and /aa */
-        if (AT_LEAST_ASCII_RESTRICTED) {
-
-            /* Under /a and /aa, nothing above ASCII matches these */
-            if (posixes) {
-                _invlist_intersection(posixes,
-                                    PL_XPosix_ptrs[_CC_ASCII],
-                                    &posixes);
-            }
-
-            /* Under /a and /aa, everything above ASCII matches these
-             * complements */
-            if (nposixes) {
-                _invlist_union_complement_2nd(nposixes,
-                                              PL_XPosix_ptrs[_CC_ASCII],
-                                              &nposixes);
-            }
-        }
-
         if (! DEPENDS_SEMANTICS) {
 
             /* For everything but /d, we can just add the current 'posixes' and
@@ -19953,11 +20023,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         U8 index = FLAGS(o) * 2;
         if (index < C_ARRAY_LENGTH(anyofs)) {
             if (*anyofs[index] != '[')  {
-                sv_catpv(sv, "[");
+                sv_catpvs(sv, "[");
             }
             sv_catpv(sv, anyofs[index]);
             if (*anyofs[index] != '[')  {
-                sv_catpv(sv, "]");
+                sv_catpvs(sv, "]");
             }
         }
         else {
@@ -21411,6 +21481,600 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
 
 #endif /* DEBUGGING */
 
+#ifndef PERL_IN_XSUB_RE
+
+#include "uni_keywords.h"
+
+void
+Perl_init_uniprops(pTHX)
+{
+    /* Set up the inversion list global variables */
+
+    PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
+    PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
+    PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
+    PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
+    PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
+    PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
+    PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
+    PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
+    PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
+    PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
+    PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
+    PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
+    PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
+    PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
+    PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
+    PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
+
+    PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
+    PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
+    PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
+    PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
+    PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
+    PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
+    PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
+    PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
+    PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
+    PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
+    PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
+    PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
+    PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
+    PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
+    PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
+    PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
+
+    PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
+    PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
+    PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
+    PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
+    PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
+
+    PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
+    PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
+    PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
+
+    PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
+
+    PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
+    PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
+
+    PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
+    PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
+
+    PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
+    PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
+                                            UNI__PERL_FOLDS_TO_MULTI_CHAR]);
+    PL_NonL1NonFinalFold = _new_invlist_C_array(
+                                            NonL1_Perl_Non_Final_Folds_invlist);
+
+    PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
+    PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
+    PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
+    PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
+    PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
+    PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
+    PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
+
+    /* The below are used only by deprecated functions.  They could be removed */
+    PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
+    PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
+    PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
+}
+
+SV *
+Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
+                                const bool to_fold, bool * invert)
+{
+    /* Parse the interior meat of \p{} passed to this in 'name' with length
+     * 'name_len', and return an inversion list if a property with 'name' is
+     * found, or NULL if not.  'name' point to the input with leading and
+     * trailing space trimmed.  'to_fold' indicates if /i is in effect.
+     *
+     * When the return is an inversion list, '*invert' will be set to a boolean
+     * indicating if it should be inverted or not
+     *
+     * This currently doesn't handle all cases.  A NULL return indicates the
+     * caller should try a different approach
+     */
+
+    char* lookup_name;
+    bool stricter = FALSE;
+    bool is_nv_type = FALSE;         /* nv= or numeric_value=, or possibly one
+                                        of the cjk numeric properties (though
+                                        it requires extra effort to compile
+                                        them) */
+    unsigned int i;
+    unsigned int j = 0, lookup_len;
+    int equals_pos = -1;        /* Where the '=' is found, or negative if none */
+    int slash_pos = -1;        /* Where the '/' is found, or negative if none */
+    int table_index = 0;
+    bool starts_with_In_or_Is = FALSE;
+    Size_t lookup_offset = 0;
+
+    PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
+
+    /* The input will be modified into 'lookup_name' */
+    Newx(lookup_name, name_len, char);
+    SAVEFREEPV(lookup_name);
+
+    /* Parse the input. */
+    for (i = 0; i < name_len; i++) {
+        char cur = name[i];
+
+        /* These characters can be freely ignored in most situations.  Later it
+         * may turn out we shouldn't have ignored them, and we have to reparse,
+         * but we don't have enough information yet to make that decision */
+        if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
+            continue;
+        }
+
+        /* Case differences are also ignored.  Our lookup routine assumes
+         * everything is lowercase */
+        if (isUPPER_A(cur)) {
+            lookup_name[j++] = toLOWER(cur);
+            continue;
+        }
+
+        /* A double colon is either an error, or a package qualifier to a
+         * subroutine user-defined property; neither of which do we currently
+         * handle
+         *
+         * But a single colon is a synonym for '=' */
+        if (cur == ':') {
+            if (i < name_len - 1 && name[i+1] == ':') {
+                return NULL;
+            }
+            cur = '=';
+        }
+
+        /* Otherwise, this character is part of the name. */
+        lookup_name[j++] = cur;
+
+        /* Only the equals sign needs further processing */
+        if (cur == '=') {
+            equals_pos = j; /* Note where it occurred in the input */
+            break;
+        }
+    }
+
+    /* Here, we are either done with the whole property name, if it was simple;
+     * or are positioned just after the '=' if it is compound. */
+
+    if (equals_pos >= 0) {
+        assert(! stricter); /* We shouldn't have set this yet */
+
+        /* Space immediately after the '=' is ignored */
+        i++;
+        for (; i < name_len; i++) {
+            if (! isSPACE_A(name[i])) {
+                break;
+            }
+        }
+
+        /* Certain properties need special handling.  They may optionally be
+         * prefixed by 'is'.  Ignore that prefix for the purposes of checking
+         * if this is one of those properties */
+        if (memBEGINPs(lookup_name, name_len, "is")) {
+            lookup_offset = 2;
+        }
+
+        /* Then check if it is one of these properties.  This is hard-coded
+         * because easier this way, and the list is unlikely to change.  There
+         * are several properties like this in the Unihan DB, which is unlikely
+         * to be compiled, and they all end with 'numeric'.  The interiors
+         * aren't checked for the precise property.  This would stop working if
+         * a cjk property were to be created that ended with 'numeric' and
+         * wasn't a numeric type */
+        is_nv_type = memEQs(lookup_name + lookup_offset,
+                       j - 1 - lookup_offset, "numericvalue")
+                  || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "nv")
+                  || (   memENDPs(lookup_name + lookup_offset,
+                            j - 1 - lookup_offset, "numeric")
+                      && (   memBEGINPs(lookup_name + lookup_offset,
+                                      j - 1 - lookup_offset, "cjk")
+                          || memBEGINPs(lookup_name + lookup_offset,
+                                      j - 1 - lookup_offset, "k")));
+        if (   is_nv_type
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "canonicalcombiningclass")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "ccc")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "age")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "in")
+            || memEQs(lookup_name + lookup_offset,
+                      j - 1 - lookup_offset, "presentin"))
+        {
+            unsigned int k;
+
+            /* What makes these properties special is that the stuff after the
+             * '=' is a number.  Therefore, we can't throw away '-'
+             * willy-nilly, as those could be a minus sign.  Other stricter
+             * rules also apply.  However, these properties all can have the
+             * rhs not be a number, in which case they contain at least one
+             * alphabetic.  In those cases, the stricter rules don't apply.
+             * But the numeric type properties can have the alphas [Ee] to
+             * signify an exponent, and it is still a number with stricter
+             * rules.  So look for an alpha that signifys not-strict */
+            stricter = TRUE;
+            for (k = i; k < name_len; k++) {
+                if (   isALPHA_A(name[k])
+                    && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
+                {
+                    stricter = FALSE;
+                    break;
+                }
+            }
+        }
+
+        if (stricter) {
+
+            /* A number may have a leading '+' or '-'.  The latter is retained
+             * */
+            if (name[i] == '+') {
+                i++;
+            }
+            else if (name[i] == '-') {
+                lookup_name[j++] = '-';
+                i++;
+            }
+
+            /* Skip leading zeros including single underscores separating the
+             * zeros, or between the final leading zero and the first other
+             * digit */
+            for (; i < name_len - 1; i++) {
+                if (   name[i] != '0'
+                    && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
+                {
+                    break;
+                }
+            }
+        }
+    }
+    else {  /* No '=' */
+
+       /* We are now in a position to determine if this property should have
+        * been parsed using stricter rules.  Only a few are like that, and
+        * unlikely to change. */
+        if (   memBEGINPs(lookup_name, j, "perl")
+            && memNEs(lookup_name + 4, j - 4, "space")
+            && memNEs(lookup_name + 4, j - 4, "word"))
+        {
+            stricter = TRUE;
+
+            /* We set the inputs back to 0 and the code below will reparse,
+             * using strict */
+            i = j = 0;
+        }
+    }
+
+    /* Here, we have either finished the property, or are positioned to parse
+     * the remainder, and we know if stricter rules apply.  Finish out, if not
+     * already done */
+    for (; i < name_len; i++) {
+        char cur = name[i];
+
+        /* In all instances, case differences are ignored, and we normalize to
+         * lowercase */
+        if (isUPPER_A(cur)) {
+            lookup_name[j++] = toLOWER(cur);
+            continue;
+        }
+
+        /* An underscore is skipped, but not under strict rules unless it
+         * separates two digits */
+        if (cur == '_') {
+            if (    stricter
+                && (     i == 0 || (int) i == equals_pos || i == name_len- 1
+                    || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
+            {
+                lookup_name[j++] = '_';
+            }
+            continue;
+        }
+
+        /* Hyphens are skipped except under strict */
+        if (cur == '-' && ! stricter) {
+            continue;
+        }
+
+        /* XXX Bug in documentation.  It says white space skipped adjacent to
+         * non-word char.  Maybe we should, but shouldn't skip it next to a dot
+         * in a number */
+        if (isSPACE_A(cur) && ! stricter) {
+            continue;
+        }
+
+        lookup_name[j++] = cur;
+
+        /* Unless this is a non-trailing slash, we are done with it */
+        if (i >= name_len - 1 || cur != '/') {
+            continue;
+        }
+
+        slash_pos = j;
+
+        /* A slash in the 'numeric value' property indicates that what follows
+         * is a denominator.  It can have a leading '+' and '0's that should be
+         * skipped.  But we have never allowed a negative denominator, so treat
+         * a minus like every other character.  (No need to rule out a second
+         * '/', as that won't match anything anyway */
+        if (is_nv_type) {
+            i++;
+            if (i < name_len && name[i] == '+') {
+                i++;
+            }
+
+            /* Skip leading zeros including underscores separating digits */
+            for (; i < name_len - 1; i++) {
+                if (   name[i] != '0'
+                    && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
+                {
+                    break;
+                }
+            }
+
+            /* Store the first real character in the denominator */
+            lookup_name[j++] = name[i];
+        }
+    }
+
+    /* Here are completely done parsing the input 'name', and 'lookup_name'
+     * contains a copy, normalized.
+     *
+     * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
+     * different from without the underscores.  */
+    if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
+           || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
+        && UNLIKELY(name[name_len-1] == '_'))
+    {
+        lookup_name[j++] = '&';
+    }
+    else if (name_len > 2 && name[0] == 'I' && (   name[1] == 'n'
+                                                || name[1] == 's'))
+    {
+
+        /* Also, if the original input began with 'In' or 'Is', it could be a
+         * subroutine call instead of a property names, which currently isn't
+         * handled by this function.  Subroutine calls can't happen if there is
+         * an '=' in the name */
+        if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
+        {
+            return NULL;
+        }
+
+        starts_with_In_or_Is = TRUE;
+    }
+
+    lookup_len = j;     /* Use a more mnemonic name starting here */
+
+    /* Get the index into our pointer table of the inversion list corresponding
+     * to the property */
+    table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+
+    /* If it didn't find the property */
+    if (table_index == 0) {
+
+        /* If didn't find the property, we try again stripping off any initial
+         * 'In' or 'Is' */
+        if (starts_with_In_or_Is) {
+            lookup_name += 2;
+            lookup_len -= 2;
+            equals_pos -= 2;
+            slash_pos -= 2;
+
+            table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+        }
+
+        if (table_index == 0) {
+            char * canonical;
+
+            /* If not found, and not a numeric type property, isn't a legal
+             * property */
+            if (! is_nv_type) {
+                return NULL;
+            }
+
+            /* But the numeric type properties need more work to decide.  What
+             * we do is make sure we have the number in canonical form and look
+             * that up. */
+
+            if (slash_pos < 0) {    /* No slash */
+
+                /* When it isn't a rational, take the input, convert it to a
+                 * NV, then create a canonical string representation of that
+                 * NV. */
+
+                NV value;
+
+                /* Get the value */
+                if (my_atof3(lookup_name + equals_pos, &value,
+                             lookup_len - equals_pos)
+                          != lookup_name + lookup_len)
+                {
+                    return NULL;
+                }
+
+                /* If the value is an integer, the canonical value is integral */
+                if (Perl_ceil(value) == value) {
+                    canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
+                                                equals_pos, lookup_name, value);
+                }
+                else {  /* Otherwise, it is %e with a known precision */
+                    char * exp_ptr;
+
+                    canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
+                                                equals_pos, lookup_name,
+                                                PL_E_FORMAT_PRECISION, value);
+
+                    /* The exponent generated is expecting two digits, whereas
+                     * %e on some systems will generate three.  Remove leading
+                     * zeros in excess of 2 from the exponent.  We start
+                     * looking for them after the '=' */
+                    exp_ptr = strchr(canonical + equals_pos, 'e');
+                    if (exp_ptr) {
+                        char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
+                        SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
+
+                        assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
+
+                        if (excess_exponent_len > 0) {
+                            SSize_t leading_zeros = strspn(cur_ptr, "0");
+                            SSize_t excess_leading_zeros
+                                    = MIN(leading_zeros, excess_exponent_len);
+                            if (excess_leading_zeros > 0) {
+                                Move(cur_ptr + excess_leading_zeros,
+                                     cur_ptr,
+                                     strlen(cur_ptr) - excess_leading_zeros
+                                       + 1,  /* Copy the NUL as well */
+                                     char);
+                            }
+                        }
+                    }
+                }
+            }
+            else {  /* Has a slash.  Create a rational in canonical form  */
+                UV numerator, denominator, gcd, trial;
+                const char * end_ptr;
+                const char * sign = "";
+
+                /* We can't just find the numerator, denominator, and do the
+                 * division, then use the method above, because that is
+                 * inexact.  And the input could be a rational that is within
+                 * epsilon (given our precision) of a valid rational, and would
+                 * then incorrectly compare valid.
+                 *
+                 * We're only interested in the part after the '=' */
+                const char * this_lookup_name = lookup_name + equals_pos;
+                lookup_len -= equals_pos;
+                slash_pos -= equals_pos;
+
+                /* Handle any leading minus */
+                if (this_lookup_name[0] == '-') {
+                    sign = "-";
+                    this_lookup_name++;
+                    lookup_len--;
+                    slash_pos--;
+                }
+
+                /* Convert the numerator to numeric */
+                end_ptr = this_lookup_name + slash_pos;
+                if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
+                    return NULL;
+                }
+
+                /* It better have included all characters before the slash */
+                if (*end_ptr != '/') {
+                    return NULL;
+                }
+
+                /* Set to look at just the denominator */
+                this_lookup_name += slash_pos;
+                lookup_len -= slash_pos;
+                end_ptr = this_lookup_name + lookup_len;
+
+                /* Convert the denominator to numeric */
+                if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
+                    return NULL;
+                }
+
+                /* It better be the rest of the characters, and don't divide by
+                 * 0 */
+                if (   end_ptr != this_lookup_name + lookup_len
+                    || denominator == 0)
+                {
+                    return NULL;
+                }
+
+                /* Get the greatest common denominator using
+                   http://en.wikipedia.org/wiki/Euclidean_algorithm */
+                gcd = numerator;
+                trial = denominator;
+                while (trial != 0) {
+                    UV temp = trial;
+                    trial = gcd % trial;
+                    gcd = temp;
+                }
+
+                /* If already in lowest possible terms, we have already tried
+                 * looking this up */
+                if (gcd == 1) {
+                    return NULL;
+                }
+
+                /* Reduce the rational, which should put it in canonical form.
+                 * Then look it up */
+                numerator /= gcd;
+                denominator /= gcd;
+
+                canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
+                        equals_pos, lookup_name, sign, numerator, denominator);
+            }
+
+            /* Here, we have the number in canonical form.  Try that */
+            table_index = match_uniprop((U8 *) canonical, strlen(canonical));
+            if (table_index == 0) {
+                return NULL;
+            }
+        }
+    }
+
+    /* The return is an index into a table of ptrs.  A negative return
+     * signifies that the real index is the absolute value, but the result
+     * needs to be inverted */
+    if (table_index < 0) {
+        *invert = TRUE;
+        table_index = -table_index;
+    }
+    else {
+        *invert = FALSE;
+    }
+
+    /* Out-of band indices indicate a deprecated property.  The proper index is
+     * modulo it with the table size.  And dividing by the table size yields
+     * an offset into a table constructed to contain the corresponding warning
+     * message */
+    if (table_index > MAX_UNI_KEYWORD_INDEX) {
+        Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
+        table_index %= MAX_UNI_KEYWORD_INDEX;
+        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
+                "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
+                (int) name_len, name, deprecated_property_msgs[warning_offset]);
+    }
+
+    /* In a few properties, a different property is used under /i.  These are
+     * unlikely to change, so are hard-coded here. */
+    if (to_fold) {
+        if (   table_index == UNI_XPOSIXUPPER
+            || table_index == UNI_XPOSIXLOWER
+            || table_index == UNI_TITLE)
+        {
+            table_index = UNI_CASED;
+        }
+        else if (   table_index == UNI_UPPERCASELETTER
+                 || table_index == UNI_LOWERCASELETTER
+#  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
+                 || table_index == UNI_TITLECASELETTER
+#  endif
+        ) {
+            table_index = UNI_CASEDLETTER;
+        }
+        else if (  table_index == UNI_POSIXUPPER
+                || table_index == UNI_POSIXLOWER)
+        {
+            table_index = UNI_POSIXALPHA;
+        }
+    }
+
+    /* Create and return the inversion list */
+    return _new_invlist_C_array(uni_prop_ptrs[table_index]);
+}
+
+#endif
+
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */