This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
maint policy: Separate build/installation issues from test failures
[perl5.git] / regcomp.c
index 7d5c50c..d1fa74d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,7 +87,6 @@ EXTERN_C const struct regexp_engine my_reg_engine;
 #endif
 
 #include "dquote_static.c"
-#include "charclass_invlists.h"
 #include "inline_invlist.c"
 #include "unicode_constants.h"
 
@@ -122,6 +121,12 @@ typedef struct scan_frame {
     struct scan_frame *next_frame;      /* next frame */
 } scan_frame;
 
+/* In a bracketed character class, these punctuation characters are output
+ * as a two-character sequence, the first being a backslash. */
+#define isBACKSLASHED_PUNCT(c)                                              \
+                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
+
 struct RExC_state_t {
     U32                flags;                  /* RXf_* are we folding, multilining? */
     U32                pm_flags;               /* PMf_* stuff from the calling PMOP */
@@ -515,6 +520,10 @@ static const scan_data_t zero_scan_data =
                 UTF8fARG(UTF, offset, RExC_precomp), \
                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 
+/* Used to point after bad bytes for an error message, but avoid skipping
+ * past a nul byte. */
+#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+
 /*
  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
  * arg. Show regex, up to a maximum length. If it's too long, chop and add
@@ -886,8 +895,6 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
     PerlIO_printf(Perl_debug_log,"\n");                              \
 });
 
-#ifdef DEBUGGING
-
 /* is c a control character for which we have a mnemonic? */
 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 
@@ -911,8 +918,6 @@ S_cntrl_to_mnemonic(const U8 c)
     return NULL;
 }
 
-#endif
-
 /* Mark that we cannot extend a found fixed substring at this point.
    Update the longest found anchored substring and the longest found
    floating substrings if needed. */
@@ -1914,7 +1919,7 @@ then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
 single traverse. We store a mapping from accepting to state to which word was
 matched, and then when we have multiple possibilities we try to complete the
-rest of the regex in the order in which they occured in the alternation.
+rest of the regex in the order in which they occurred in the alternation.
 
 The only prior NFA like behaviour that would be changed by the TRIE support is
 the silent ignoring of duplicate alternations which are of the form:
@@ -4380,7 +4385,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     ( flags & SCF_IN_DEFINE )
                     ||
                     (
-                        (is_inf_internal || is_inf || data->flags & SF_IS_INF)
+                        (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
                         &&
                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
                     )
@@ -5632,7 +5637,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
         }
         return final_minlen;
     }
-    NOT_REACHED;
+    NOT_REACHED; /* NOTREACHED */
 }
 
 STATIC U32
@@ -6407,7 +6412,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     DEBUG_r(if (!PL_colorset) reginitcolors());
 
-#ifndef PERL_IN_XSUB_RE
     /* Initialize these here instead of as-needed, as is quick and avoids
      * having to test them each time otherwise */
     if (! PL_AboveLatin1) {
@@ -6425,7 +6429,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
        PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
                                                     NUM_ANYOF_CODE_POINTS - 1);
     }
-#endif
 
     pRExC_state->code_blocks = NULL;
     pRExC_state->num_code_blocks = 0;
@@ -6877,7 +6880,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         SAVEFREEPV(RExC_recurse);
     }
 
-reStudy:
+  reStudy:
     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
     DEBUG_r(
         RExC_study_chunk_recursed_count= 0;
@@ -7888,7 +7891,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
                       (unsigned long) flags);
         }
-        NOT_REACHED; /* NOT REACHED */
+        NOT_REACHED; /* NOTREACHED */
     }
     return NULL;
 }
@@ -8007,27 +8010,6 @@ S__invlist_array_init(SV* const invlist, const bool will_have_0)
     return zero_addr + *offset;
 }
 
-PERL_STATIC_INLINE UV*
-S_invlist_array(SV* const invlist)
-{
-    /* Returns the pointer to the inversion list's array.  Every time the
-     * length changes, this needs to be called in case malloc or realloc moved
-     * it */
-
-    PERL_ARGS_ASSERT_INVLIST_ARRAY;
-
-    /* Must not be empty.  If these fail, you probably didn't check for <len>
-     * being non-zero before trying to get the array */
-    assert(_invlist_len(invlist));
-
-    /* The very first element always contains zero, The array begins either
-     * there, or if the inversion list is offset, at the element after it.
-     * The offset header field determines which; it contains 0 or 1 to indicate
-     * how much additionally to add */
-    assert(0 == *(SvPVX(invlist)));
-    return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
-}
-
 PERL_STATIC_INLINE void
 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
 {
@@ -8466,7 +8448,7 @@ Perl__invlist_populate_swatch(SV* const invlist,
             swatch[offset >> 3] |= 1 << (offset & 7);
         }
 
-    join_end_of_list:
+      join_end_of_list:
 
        /* Quit if at the end of the list */
         if (i >= len) {
@@ -9638,7 +9620,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                      : REGEX_DEPENDS_CHARSET;
                 has_charset_modifier = DEPENDS_PAT_MOD;
                 break;
-            excess_modifier:
+              excess_modifier:
                 RExC_parse++;
                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
@@ -9651,7 +9633,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
                 }
                 NOT_REACHED; /*NOTREACHED*/
-            neg_modifier:
+              neg_modifier:
                 RExC_parse++;
                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
                                     *(RExC_parse - 1));
@@ -9723,8 +9705,8 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 return;
                 /*NOTREACHED*/
             default:
-            fail_modifiers:
-                RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+              fail_modifiers:
+                RExC_parse += SKIP_IF_CHAR(RExC_parse);
                /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
@@ -9972,7 +9954,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     nextchar(pRExC_state);
                     return ret;
                 }
-                RExC_parse++;
+                --RExC_parse;
+                RExC_parse += SKIP_IF_CHAR(RExC_parse);
                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
                vFAIL3("Sequence (%.*s...) not recognized",
                                 RExC_parse-seqstart, seqstart);
@@ -10112,14 +10095,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 if (RExC_parse == RExC_end || *RExC_parse != ')')
                     vFAIL("Sequence (?&... not terminated");
                 goto gen_recurse_regop;
-                /* NOT REACHED */
+                /* NOTREACHED */
             case '+':
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse++;
                     vFAIL("Illegal pattern");
                 }
                 goto parse_recursion;
-                /* NOT REACHED*/
+                /* NOTREACHED*/
             case '-': /* (?-1) */
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse--; /* rewind to let it be handled later */
@@ -10132,12 +10115,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
               parse_recursion:
                 {
                     bool is_neg = FALSE;
+                    UV unum;
                     parse_start = RExC_parse - 1; /* MJD */
                     if (*RExC_parse == '-') {
                         RExC_parse++;
                         is_neg = TRUE;
                     }
-                    num = grok_atou(RExC_parse, &endptr);
+                    unum = grok_atou(RExC_parse, &endptr);
+                    num = (unum > I32_MAX) ? I32_MAX : (I32)unum;
                     if (endptr)
                        RExC_parse = (char*)endptr;
                     if (is_neg) {
@@ -10190,12 +10175,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 nextchar(pRExC_state);
                 return ret;
 
-            /* NOT REACHED */
+            /* NOTREACHED */
 
            case '?':           /* (??...) */
                is_logical = 1;
                if (*RExC_parse != '{') {
-                   RExC_parse++;
+                    RExC_parse += SKIP_IF_CHAR(RExC_parse);
                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
                     vFAIL2utf8f(
                         "Sequence (%"UTF8f"...) not recognized",
@@ -10311,8 +10296,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     ret = reganode(pRExC_state,NGROUPP,num);
                     goto insert_if_check_paren;
                }
-               else if (strnEQ(RExC_parse, "DEFINE",
-                                       MIN(DEFINE_len, RExC_end - RExC_parse)))
+               else if (RExC_end - RExC_parse >= DEFINE_len
+                        && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
                 {
                    ret = reganode(pRExC_state,DEFINEP,0);
                    RExC_parse += DEFINE_len;
@@ -10428,7 +10413,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
               parse_flags:
                parse_lparen_question_flags(pRExC_state);
                 if (UCHARAT(RExC_parse) != ':') {
-                    nextchar(pRExC_state);
+                    if (*RExC_parse)
+                        nextchar(pRExC_state);
                     *flagp = TRYAGAIN;
                     return NULL;
                 }
@@ -10461,6 +10447,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             Set_Node_Offset(ret, RExC_parse); /* MJD */
            is_open = 1;
        } else {
+            /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
+            paren = ':';
            ret = NULL;
        }
     }
@@ -10868,7 +10856,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 nextchar(pRExC_state);
             }
 
-       do_curly:
+         do_curly:
            if ((flags&SIMPLE)) {
                 MARK_NAUGHTY_EXP(2, 2);
                reginsert(pRExC_state, CURLY, ret, depth+1);
@@ -11619,7 +11607,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
     PERL_ARGS_ASSERT_REGATOM;
 
-tryagain:
+  tryagain:
     switch ((U8)*RExC_parse) {
     case '^':
        RExC_seen_zerolen++;
@@ -11657,7 +11645,7 @@ tryagain:
                        FALSE, /* means parse the whole char class */
                        TRUE, /* allow multi-char folds */
                        FALSE, /* don't silence non-portable warnings. */
-                       RExC_strict,
+                       (bool) RExC_strict,
                        NULL);
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
@@ -11779,49 +11767,121 @@ tryagain:
             arg = ANYOF_WORDCHAR;
             goto join_posix;
 
+       case 'B':
+            invert = 1;
+            /* FALLTHROUGH */
        case 'b':
+          {
+           regex_charset charset = get_regex_charset(RExC_flags);
+
            RExC_seen_zerolen++;
             RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = BOUND + get_regex_charset(RExC_flags);
-            if (op > BOUNDA) {  /* /aa is same as /a */
-                op = BOUNDA;
-            }
-            else if (op == BOUNDL) {
+           op = BOUND + charset;
+
+            if (op == BOUNDL) {
                 RExC_contains_locale = 1;
             }
+
            ret = reg_node(pRExC_state, op);
-           FLAGS(ret) = get_regex_charset(RExC_flags);
            *flagp |= SIMPLE;
-           if ((U8) *(RExC_parse + 1) == '{') {
-                /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
-           }
-           goto finish_meta_pat;
-       case 'B':
-           RExC_seen_zerolen++;
-            RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = NBOUND + get_regex_charset(RExC_flags);
-            if (op > NBOUNDA) { /* /aa is same as /a */
-                op = NBOUNDA;
-            }
-            else if (op == NBOUNDL) {
-                RExC_contains_locale = 1;
+           if (*(RExC_parse + 1) != '{') {
+                FLAGS(ret) = TRADITIONAL_BOUND;
+                if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDA;
+                }
             }
-           ret = reg_node(pRExC_state, op);
-           FLAGS(ret) = get_regex_charset(RExC_flags);
-           *flagp |= SIMPLE;
-           if ((U8) *(RExC_parse + 1) == '{') {
-                /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
+            else {
+                STRLEN length;
+                char name = *RExC_parse;
+                char * endbrace;
+                RExC_parse += 2;
+                endbrace = strchr(RExC_parse, '}');
+
+                if (! endbrace) {
+                    vFAIL2("Missing right brace on \\%c{}", name);
+                }
+                /* XXX Need to decide whether to take spaces or not.  Should be
+                 * consistent with \p{}, but that currently is SPACE, which
+                 * means vertical too, which seems wrong
+                 * while (isBLANK(*RExC_parse)) {
+                    RExC_parse++;
+                }*/
+                if (endbrace == RExC_parse) {
+                    RExC_parse++;  /* After the '}' */
+                    vFAIL2("Empty \\%c{}", name);
+                }
+                length = endbrace - RExC_parse;
+                /*while (isBLANK(*(RExC_parse + length - 1))) {
+                    length--;
+                }*/
+                switch (*RExC_parse) {
+                    case 'g':
+                        if (length != 1
+                            && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+                        {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = GCB_BOUND;
+                        break;
+                    case 's':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = SB_BOUND;
+                        break;
+                    case 'w':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = WB_BOUND;
+                        break;
+                    default:
+                      bad_bound_type:
+                        RExC_parse = endbrace;
+                       vFAIL2utf8f(
+                            "'%"UTF8f"' is an unknown bound type",
+                           UTF8fARG(UTF, length, endbrace - length));
+                        NOT_REACHED; /*NOTREACHED*/
+                }
+                RExC_parse = endbrace;
+                RExC_uni_semantics = 1;
+
+                if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDU;
+                    length += 4;
+
+                    /* Don't have to worry about UTF-8 in this message, because
+                     * to get here the contents of the \b must be ASCII */
+                    ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
+                              "Using /u for '%.*s' instead of /%s",
+                              (unsigned) length,
+                              endbrace - length + 1,
+                              (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+                              ? ASCII_RESTRICT_PAT_MODS
+                              : ASCII_MORE_RESTRICT_PAT_MODS);
+                }
            }
+
+            if (PASS2 && invert) {
+                OP(ret) += NBOUND - BOUND;
+            }
            goto finish_meta_pat;
+          }
 
        case 'D':
             invert = 1;
             /* FALLTHROUGH */
        case 'd':
             arg = ANYOF_DIGIT;
-            goto join_posix;
+            if (! DEPENDS_SEMANTICS) {
+                goto join_posix;
+            }
+
+            /* \d doesn't have any matches in the upper Latin1 range, hence /d
+             * is equivalent to /u.  Changing to /u saves some branches at
+             * runtime */
+            op = POSIXU;
+            goto join_posix_op_known;
 
        case 'R':
            ret = reg_node(pRExC_state, LNBREAK);
@@ -11850,7 +11910,7 @@ tryagain:
        case 's':
             arg = ANYOF_SPACE;
 
-        join_posix:
+          join_posix:
 
            op = POSIXD + get_regex_charset(RExC_flags);
             if (op > POSIXA) {  /* /aa is same as /a */
@@ -11860,7 +11920,7 @@ tryagain:
                 RExC_contains_locale = 1;
             }
 
-        join_posix_op_known:
+          join_posix_op_known:
 
             if (invert) {
                 op += NPOSIXD - POSIXD;
@@ -11874,7 +11934,7 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
             /* FALLTHROUGH */
 
-         finish_meta_pat:
+          finish_meta_pat:
            nextchar(pRExC_state);
             Set_Node_Length(ret, 2); /* MJD */
            break;
@@ -11893,7 +11953,7 @@ tryagain:
                                FALSE, /* don't silence non-portable warnings.
                                          It would be a bug if these returned
                                          non-portables */
-                               RExC_strict,
+                               (bool) RExC_strict,
                                NULL);
                 /* regclass() can only return RESTART_UTF8 if multi-char folds
                    are allowed.  */
@@ -11930,7 +11990,7 @@ tryagain:
             }
             break;
        case 'k':    /* Handle \k<NAME> and \k'NAME' */
-       parse_named_seq:
+      parse_named_seq:
         {
             char ch= RExC_parse[1];
            if (ch != '<' && ch != '\'' && ch != '{') {
@@ -12020,19 +12080,37 @@ tryagain:
                 }
                 else {
                     num = S_backref_value(RExC_parse);
-                    /* bare \NNN might be backref or octal - if it is larger than or equal
-                     * RExC_npar then it is assumed to be and octal escape.
-                     * Note RExC_npar is +1 from the actual number of parens*/
-                    if (num == I32_MAX || (num > 9 && num >= RExC_npar
-                            && *RExC_parse != '8' && *RExC_parse != '9'))
+                    /* bare \NNN might be backref or octal - if it is larger
+                     * than or equal to RExC_npar then it is assumed to be an
+                     * octal escape. Note RExC_npar is +1 from the actual
+                     * number of parens. */
+                    /* Note we do NOT check if num == I32_MAX here, as that is
+                     * handled by the RExC_npar check */
+
+                    if (
+                        /* any numeric escape < 10 is always a backref */
+                        num > 9
+                        /* any numeric escape < RExC_npar is a backref */
+                        && num >= RExC_npar
+                        /* cannot be an octal escape if it starts with 8 */
+                        && *RExC_parse != '8'
+                        /* cannot be an octal escape if it starts with 9 */
+                        && *RExC_parse != '9'
+                    )
                     {
-                        /* Probably a character specified in octal, e.g. \35 */
+                        /* Probably not a backref, instead likely to be an
+                         * octal character escape, e.g. \35 or \777.
+                         * The above logic should make it obvious why using
+                         * octal escapes in patterns is problematic. - Yves */
                         goto defchar;
                     }
                 }
 
-                /* at this point RExC_parse definitely points to a backref
-                 * number */
+                /* At this point RExC_parse points at a numeric escape like
+                 * \12 or \88 or something similar, which we should NOT treat
+                 * as an octal escape. It may or may not be a valid backref
+                 * escape. For instance \88888888 is unlikely to be a valid
+                 * backref. */
                {
 #ifdef RE_TRACK_PATTERN_OFFSETS
                    char * const parse_start = RExC_parse - 1; /* MJD */
@@ -12096,7 +12174,7 @@ tryagain:
 
            RExC_parse++;
 
-       defchar: {
+         defchar: {
            STRLEN len = 0;
            UV ender = 0;
            char *p;
@@ -12135,7 +12213,7 @@ tryagain:
 
             s0 = s;
 
-       reparse:
+         reparse:
 
             /* We do the EXACTFish to EXACT node only if folding.  (And we
              * don't need to figure this out until pass 2) */
@@ -12268,7 +12346,7 @@ tryagain:
                                                       &result,
                                                       &error_msg,
                                                       PASS2, /* out warnings */
-                                                       RExC_strict,
+                                                       (bool) RExC_strict,
                                                        TRUE, /* Output warnings
                                                                 for non-
                                                                 portables */
@@ -12297,7 +12375,7 @@ tryagain:
                                                       &result,
                                                       &error_msg,
                                                       PASS2, /* out warnings */
-                                                       RExC_strict,
+                                                       (bool) RExC_strict,
                                                        TRUE, /* Silence warnings
                                                                 for non-
                                                                 portables */
@@ -12323,6 +12401,9 @@ tryagain:
                        break;
                     case '8': case '9': /* must be a backreference */
                         --p;
+                        /* we have an escape like \8 which cannot be an octal escape
+                         * so we exit the loop, and let the outer loop handle this
+                         * escape which may or may not be a legitimate backref. */
                         goto loopdone;
                     case '1': case '2': case '3':case '4':
                    case '5': case '6': case '7':
@@ -12371,7 +12452,7 @@ tryagain:
                        if (IN_ENCODING && ender < 0x100)
                            goto recode_encoding;
                        break;
-                   recode_encoding:
+                     recode_encoding:
                        if (! RExC_override_recoding) {
                            SV* enc = _get_encoding();
                            ender = reg_recode((const char)(U8)ender, &enc);
@@ -12754,8 +12835,8 @@ tryagain:
                 }
            }   /* End of verifying node ends with an appropriate char */
 
-       loopdone:   /* Jumped to when encounters something that shouldn't be in
-                      the node */
+          loopdone:   /* Jumped to when encounters something that shouldn't be
+                         in the node */
 
             /* I (khw) don't know if you can get here with zero length, but the
              * old code handled this situation by creating a zero-length EXACT
@@ -12970,7 +13051,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
                            break;
                        case 'e':
                            if (memEQ(posixcc, "spac", 4)) /* space */
-                               namedclass = ANYOF_PSXSPC;
+                               namedclass = ANYOF_SPACE;
                            break;
                        case 'h':
                            if (memEQ(posixcc, "grap", 4)) /* graph */
@@ -13223,7 +13304,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
             RExC_parse++;
         }
 
-        no_close:
+      no_close:
         FAIL("Syntax error in (?[...])");
     }
 
@@ -13886,6 +13967,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
        const char *s = RExC_parse;
        const char  c = *s++;
 
+        if (*s == '^') {
+            s++;
+        }
        while (isWORDCHAR(*s))
            s++;
        if (*s && c == *s && s[1] == ']') {
@@ -13921,7 +14005,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             break;
         }
 
-    charclassloop:
+      charclassloop:
 
        namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
         save_value = value;
@@ -14288,7 +14372,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                        goto recode_encoding;
                    break;
                }
-           recode_encoding:
+             recode_encoding:
                if (! RExC_override_recoding) {
                    SV* enc = _get_encoding();
                    value = reg_recode((const char)(U8)value, &enc);
@@ -14530,7 +14614,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 vFAIL2utf8f(
                     "Invalid [] range \"%"UTF8f"\"",
                     UTF8fARG(UTF, w, rangebegin));
-                NOT_REACHED; /* NOT REACHED */
+                NOT_REACHED; /* NOTREACHED */
            }
        }
        else {
@@ -14664,11 +14748,91 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
                 }
+                else if (prevvalue != value) {
+
+                    /* Under strict, ranges that start and/or end in an ASCII
+                     * printable should have each end point be a portable value
+                     * for it (preferably like 'A', but we don't warn if it is
+                     * a (portable) Unicode name or code point), and the range
+                     * must be all digits or all letters of the same case.
+                     * Otherwise, the range is non-portable and unclear as to
+                     * what it contains */
+                    if ((isPRINT_A(prevvalue) || isPRINT_A(value))
+                        && (non_portable_endpoint
+                            || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
+                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
+                                   || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
+                    {
+                        vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+                    }
+                    else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+
+                        /* But the nature of Unicode and languages means we
+                         * can't do the same checks for above-ASCII ranges,
+                         * except in the case of digit ones.  These should
+                         * contain only digits from the same group of 10.  The
+                         * ASCII case is handled just above.  0x660 is the
+                         * first digit character beyond ASCII.  Hence here, the
+                         * range could be a range of digits.  Find out.  */
+                        IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                         prevvalue);
+                        IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                         value);
+
+                        /* If the range start and final points are in the same
+                         * inversion list element, it means that either both
+                         * are not digits, or both are digits in a consecutive
+                         * sequence of digits.  (So far, Unicode has kept all
+                         * such sequences as distinct groups of 10, but assert
+                         * to make sure).  If the end points are not in the
+                         * same element, neither should be a digit. */
+                        if (index_start == index_final) {
+                            assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+                            || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+                            - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+                            == 10);
+                        }
+                        else if ((index_start >= 0
+                                  && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
+                                 || (index_final >= 0
+                                     && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+                        {
+                            vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
+                        }
+                    }
+                }
+            }
+            if ((! range || prevvalue == value) && non_portable_endpoint) {
+                if (isPRINT_A(value)) {
+                    char literal[3];
+                    unsigned d = 0;
+                    if (isBACKSLASHED_PUNCT(value)) {
+                        literal[d++] = '\\';
+                    }
+                    literal[d++] = (char) value;
+                    literal[d++] = '\0';
+
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
+                           (int) (RExC_parse - rangebegin),
+                           rangebegin,
+                           literal
+                        );
+                }
+                else if isMNEMONIC_CNTRL(value) {
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
+                           (int) (RExC_parse - rangebegin),
+                           rangebegin,
+                           cntrl_to_mnemonic((char) value)
+                        );
+                }
             }
         }
 
         /* Deal with this element of the class */
        if (! SIZE_ONLY) {
+
 #ifndef EBCDIC
             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
                                                      prevvalue, value);
@@ -14836,7 +15000,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                 /* The actual POSIXish node for all the rest depends on the
                  * charset modifier.  The ones in the first set depend only on
-                 * ASCII or, if available on this platform, locale */
+                 * ASCII or, if available on this platform, also locale */
                 case ANYOF_ASCII:
                 case ANYOF_NASCII:
 #ifdef HAS_ISASCII
@@ -14846,19 +15010,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 #endif
                     goto join_posix;
 
-                case ANYOF_NCASED:
+                /* The following don't have any matches in the upper Latin1
+                 * range, hence /d is equivalent to /u for them.  Making it /u
+                 * saves some branches at runtime */
+                case ANYOF_DIGIT:
+                case ANYOF_NDIGIT:
+                case ANYOF_XDIGIT:
+                case ANYOF_NXDIGIT:
+                    if (! DEPENDS_SEMANTICS) {
+                        goto treat_as_default;
+                    }
+
+                    op = POSIXU;
+                    goto join_posix;
+
+                /* The following change to CASED under /i */
                 case ANYOF_LOWER:
                 case ANYOF_NLOWER:
                 case ANYOF_UPPER:
                 case ANYOF_NUPPER:
-                    /* under /a could be alpha */
                     if (FOLD) {
-                        if (ASCII_RESTRICTED) {
-                            namedclass = ANYOF_ALPHA + (namedclass % 2);
-                        }
-                        else if (! LOC) {
-                            break;
-                        }
+                        namedclass = ANYOF_CASED + (namedclass % 2);
                     }
                     /* FALLTHROUGH */
 
@@ -14866,12 +15038,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  * We take advantage of the enum ordering of the charset
                  * modifiers to get the exact node type, */
                 default:
+                  treat_as_default:
                     op = POSIXD + get_regex_charset(RExC_flags);
                     if (op > POSIXA) { /* /aa is same as /a */
                         op = POSIXA;
                     }
 
-                join_posix:
+                  join_posix:
                     /* The odd numbered ones are the complements of the
                      * next-lower even number one */
                     if (namedclass % 2 == 1) {
@@ -14914,7 +15087,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 if (prevvalue == 'A') {
                     if (value == 'Z'
 #ifdef EBCDIC
-                        && ! non_portable_end_point
+                        && ! non_portable_endpoint
 #endif
                     ) {
                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
@@ -14924,7 +15097,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 else if (prevvalue == 'a') {
                     if (value == 'z'
 #ifdef EBCDIC
-                        && ! non_portable_end_point
+                        && ! non_portable_endpoint
 #endif
                     ) {
                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
@@ -15299,6 +15472,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     }
 
     if (ret_invlist) {
+        assert(cp_list);
+
         *ret_invlist = cp_list;
         SvREFCNT_dec(swash);
 
@@ -16306,8 +16481,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
-    || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
-    || _CC_VERTSPACE != 16
+    || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
   #error Need to adjust order of anyofs[]
 #endif
         "\\w",
@@ -16336,8 +16510,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         "[:^blank:]",
         "[:xdigit:]",
         "[:^xdigit:]",
-        "[:space:]",
-        "[:^space:]",
         "[:cntrl:]",
         "[:^cntrl:]",
         "[:ascii:]",
@@ -16588,7 +16760,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                         sv_catpv(sv, t);
                     }
 
-                out_dump:
+                  out_dump:
 
                     Safefree(origs);
                     SvREFCNT_dec_NN(lv);
@@ -16636,6 +16808,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
         }
     }
+    else if (k == BOUND || k == NBOUND) {
+        /* Suffix naming the boundary type; FLAGS(o) indexes this table, which
+         * must be synced with order of 'bound_type' in regcomp.h */
+        static const char * const bounds[] = {
+            "",      /* Traditional */
+            "{gcb}", "{sb}", "{wb}"
+        };
+        assert(FLAGS(o) < sizeof(bounds) / sizeof(bounds[0]));
+        sv_catpv(sv, bounds[FLAGS(o)]);
+    }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
        Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
     else if (OP(o) == SBOL)
@@ -17215,10 +17397,6 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
 }
 
 #ifdef DEBUGGING
-/* Certain characters are output as a sequence with the first being a
- * backslash. */
-#define isBACKSLASHED_PUNCT(c)                                              \
-                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 
 STATIC void
 S_put_code_point(pTHX_ SV *sv, UV c)