This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp: Add a warning when \p is used under locale rules.
[perl5.git] / regcomp.c
index 341ac74..015b4ff 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -134,10 +134,14 @@ typedef struct RExC_state_t {
     I32                orig_utf8;      /* whether the pattern was originally in utf8 */
                                /* XXX use this for future optimisation of case
                                 * where pattern must be upgraded to utf8. */
+    I32                uni_semantics;  /* If a d charset modifier should use unicode
+                                  rules, even if the pattern is not in
+                                  utf8 */
     HV         *paren_names;           /* Paren names */
     
     regnode    **recurse;              /* Recurse regops */
     I32                recurse_count;          /* Number of recurse regops */
+    I32                in_lookbehind;
 #if ADD_TO_REGEXEC
     char       *starttry;              /* -Dr: where regtry was called. */
 #define RExC_starttry  (pRExC_state->starttry)
@@ -177,6 +181,7 @@ typedef struct RExC_state_t {
 #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
 #define RExC_seen_evals        (pRExC_state->seen_evals)
 #define RExC_utf8      (pRExC_state->utf8)
+#define RExC_uni_semantics     (pRExC_state->uni_semantics)
 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 #define RExC_open_parens       (pRExC_state->open_parens)
 #define RExC_close_parens      (pRExC_state->close_parens)
@@ -184,6 +189,7 @@ typedef struct RExC_state_t {
 #define RExC_paren_names       (pRExC_state->paren_names)
 #define RExC_recurse   (pRExC_state->recurse)
 #define RExC_recurse_count     (pRExC_state->recurse_count)
+#define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
 
 
 #define        ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
@@ -200,7 +206,7 @@ typedef struct RExC_state_t {
 #define        HASWIDTH        0x01    /* Known to match non-null strings. */
 
 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant. */
+ * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 #define        SIMPLE          0x02
 #define        SPSTART         0x04    /* Starts with * or +. */
 #define TRYAGAIN       0x08    /* Weeded out a declaration. */
@@ -371,6 +377,9 @@ static const scan_data_t zero_scan_data =
 #define UTF cBOOL(RExC_utf8)
 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
+#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 
 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
@@ -3632,7 +3641,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                            if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == ALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (!isWORDCHAR_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3650,7 +3659,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                        else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                        else if (OP(scan) == ALNUMU) {
                             for (value = 0; value < 256; value++) {
                                 if (isWORDCHAR_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3669,7 +3678,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                            if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == NALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (isWORDCHAR_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3688,9 +3697,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
                        else {
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);
+                            if (OP(scan) == NALNUMU) {
+                                for (value = 0; value < 256; value++) {
+                                    if (! isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_SET(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (! isALNUM(value)) {
+                                        ANYOF_BITMAP_SET(data->start_class, value);
+                                    }
+                                }
+                           }
                        }
                    }
                    break;
@@ -3698,7 +3717,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                           if (OP(scan) == SPACEU) {
                                 for (value = 0; value < 256; value++) {
                                     if (!isSPACE_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3717,7 +3736,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         if (data->start_class->flags & ANYOF_LOCALE) {
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
                         }
-                        else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                        else if (OP(scan) == SPACEU) {
                             for (value = 0; value < 256; value++) {
                                 if (isSPACE_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3736,7 +3755,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                            if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == NSPACEU) {
                                 for (value = 0; value < 256; value++) {
                                     if (isSPACE_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3754,7 +3773,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                        else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                        else if (OP(scan) == NSPACEU) {
                             for (value = 0; value < 256; value++) {
                                 if (!isSPACE_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -4377,6 +4396,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     DEBUG_r(if (!PL_colorset) reginitcolors());
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    RExC_uni_semantics = 0;
 
     /****************** LONG JUMP TARGET HERE***********************/
     /* Longjmp back to here if have to switch in midstream to utf8 */
@@ -4446,6 +4466,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     RExC_sawback = 0;
 
     RExC_seen = 0;
+    RExC_in_lookbehind = 0;
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
     RExC_seen_evals = 0;
     RExC_extralen = 0;
@@ -4484,6 +4505,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     if (used_setjump) {
        JMPENV_POP;
     }
+
     DEBUG_PARSE_r({
         PerlIO_printf(Perl_debug_log, 
             "Required size %"IVdf" nodes\n"
@@ -4492,6 +4514,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
         RExC_lastnum=0; 
         RExC_lastparse=NULL; 
     });
+
+    /* The first pass could have found things that force Unicode semantics */
+    if ((RExC_utf8 || RExC_uni_semantics)
+        && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
+    {
+       set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+    }
+
     /* Small enough for pointer-storage convention?
        If extralen==0, this means that we will not need long jumps. */
     if (RExC_size >= 0x10000L && RExC_extralen)
@@ -5089,13 +5119,13 @@ reStudy:
     else {
         regnode *first = ri->program + 1;
         U8 fop = OP(first);
-        U8 nop = OP(NEXTOPER(first));
-        
-        if (PL_regkind[fop] == NOTHING && nop == END)
+
+        if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
             r->extflags |= RXf_NULL;
-        else if (PL_regkind[fop] == BOL && nop == END)
+        else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
             r->extflags |= RXf_START_ONLY;
-        else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
+        else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
+                            && OP(regnext(first)) == END)
             r->extflags |= RXf_WHITE;    
     }
 #endif
@@ -5925,6 +5955,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                    goto capturing_parens;
                }
                 RExC_seen |= REG_SEEN_LOOKBEHIND;
+               RExC_in_lookbehind++;
                RExC_parse++;
            case '=':           /* (?=...) */
                RExC_seen_zerolen++;
@@ -6257,10 +6288,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                       that follow */
                 has_use_defaults = TRUE;
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
-               if (RExC_utf8) {    /* But the default for a utf8 pattern is
-                                      unicode semantics */
-                   set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
-               }
+               set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
+                                               ? REGEX_UNICODE_CHARSET
+                                               : REGEX_DEPENDS_CHARSET);
                 goto parse_flags;
            default:
                --RExC_parse;
@@ -6291,6 +6321,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        cs = REGEX_UNICODE_CHARSET;
                         has_charset_modifier = 1;
                         break;
+                    case ASCII_RESTRICT_PAT_MOD:
+                        if (has_charset_modifier || flagsp == &negflags) {
+                            goto fail_modifiers;
+                        }
+                       cs = REGEX_ASCII_RESTRICTED_CHARSET;
+                        has_charset_modifier = 1;
+                        break;
                     case DEPENDS_PAT_MOD:
                         if (has_use_defaults
                             || has_charset_modifier
@@ -6301,8 +6338,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
                        /* The dual charset means unicode semantics if the
                         * pattern (or target, not known until runtime) are
-                        * utf8 */
-                       cs = (RExC_utf8)
+                        * utf8, or something in the pattern indicates unicode
+                        * semantics */
+                       cs = (RExC_utf8 || RExC_uni_semantics)
                             ? REGEX_UNICODE_CHARSET
                             : REGEX_DEPENDS_CHARSET;
                         has_charset_modifier = 1;
@@ -6565,6 +6603,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            FAIL("Junk on end of regexp");      /* "Can't happen". */
        /* NOTREACHED */
     }
+
+    if (RExC_in_lookbehind) {
+       RExC_in_lookbehind--;
+    }
     if (after_freeze)
         RExC_npar = after_freeze;
     return(ret);
@@ -7181,6 +7223,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     register regnode *ret = NULL;
     I32 flags;
     char *parse_start = RExC_parse;
+    U8 op;
     GET_RE_DEBUG_FLAGS_DECL;
     DEBUG_PARSE("atom");
     *flagp = WORST;            /* Tentatively. */
@@ -7352,77 +7395,165 @@ tryagain:
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
-           if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(ALNUML));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(ALNUM));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = ALNUML;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = ALNUMU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = ALNUMA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = ALNUM;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NALNUML));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NALNUM));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NALNUML;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NALNUMU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NALNUMA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NALNUM;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(BOUNDL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(BOUND));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = BOUNDL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = BOUNDU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = BOUNDA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = BOUND;
+                   break;
+               default:
+                   goto bad_charset;
             }
+           ret = reg_node(pRExC_state, op);
            FLAGS(ret) = get_regex_charset(RExC_flags);
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NBOUND));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NBOUNDL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NBOUNDU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NBOUNDA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NBOUND;
+                   break;
+               default:
+                   goto bad_charset;
             }
+           ret = reg_node(pRExC_state, op);
            FLAGS(ret) = get_regex_charset(RExC_flags);
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(SPACEL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(SPACE));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = SPACEL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = SPACEU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = SPACEA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = SPACE;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NSPACEL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NSPACE));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NSPACEL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NSPACEU;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NSPACEA;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NSPACE;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(DIGITL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(DIGIT));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = DIGITL;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = DIGITA;
+                   break;
+               case REGEX_DEPENDS_CHARSET: /* No difference between these */
+               case REGEX_UNICODE_CHARSET:
+                   op = DIGIT;
+                   break;
+               default:
+                   goto bad_charset;
             }
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'D':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NDIGITL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NDIGIT));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NDIGITL;
+                   break;
+               case REGEX_ASCII_RESTRICTED_CHARSET:
+                   op = NDIGITA;
+                   break;
+               case REGEX_DEPENDS_CHARSET: /* No difference between these */
+               case REGEX_UNICODE_CHARSET:
+                   op = NDIGIT;
+                   break;
+               default:
+                   goto bad_charset;
             }
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'R':
@@ -7521,7 +7652,7 @@ tryagain:
                 ret = reganode(pRExC_state,
                                ((! FOLD)
                                  ? NREF
-                                 : (UNI_SEMANTICS)
+                                 : (AT_LEAST_UNI_SEMANTICS)
                                    ? NREFFU
                                    : (LOC)
                                      ? NREFFL
@@ -7589,7 +7720,7 @@ tryagain:
                    ret = reganode(pRExC_state,
                                   ((! FOLD)
                                     ? REF
-                                    : (UNI_SEMANTICS)
+                                    : (AT_LEAST_UNI_SEMANTICS)
                                       ? REFFU
                                       : (LOC)
                                         ? REFFL
@@ -7643,7 +7774,7 @@ tryagain:
                           (U8) ((! FOLD) ? EXACT
                                          : (LOC)
                                             ? EXACTFL
-                                            : (UNI_SEMANTICS)
+                                            : (AT_LEAST_UNI_SEMANTICS)
                                               ? EXACTFU
                                               : EXACTF)
                    );
@@ -7952,6 +8083,11 @@ tryagain:
     }
 
     return(ret);
+
+/* Jumped to when an unrecognized character set is encountered */
+bad_charset:
+    Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
+    return(NULL);
 }
 
 STATIC char *
@@ -8160,18 +8296,18 @@ case ANYOF_N##NAME:                                                            \
  * there are two tests passed in, to use depending on that. There aren't any
  * cases where the label is different from the name, so no need for that
  * parameter */
-#define _C_C_T_(NAME,TEST_8,TEST_7,WORD)                                       \
+#define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
 ANYOF_##NAME:                                                                  \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
-            if (TEST_8) stored +=                                              \
+            if (TEST_8(value)) stored +=                                       \
                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
-            if (TEST_7) stored +=                                              \
+            if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
                S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
                                   (U8) UNI_TO_NATIVE(value));                 \
         }                                                                      \
@@ -8183,21 +8319,30 @@ case ANYOF_N##NAME:                                                            \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
-            if (! TEST_8) stored +=                                            \
+            if (! TEST_8(value)) stored +=                                     \
                    S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
-            if (! TEST_7) stored +=                                            \
-                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
+            if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit(  \
+                       aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value));    \
         }                                                                      \
-       /* For a non-ut8 target string with DEPENDS semantics, all above ASCII \
-        * Latin1 code points match the complement of any of the classes.  But \
-        * in utf8, they have their Unicode semantics, so can't just set them  \
-        * in the bitmap, or else regexec.c will think they matched when they  \
-        * shouldn't. */                                                       \
-       ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;              \
+       if (ASCII_RESTRICTED) {                                                \
+           for (value = 128; value < 256; value++) {                          \
+             stored += S_set_regclass_bit(                                     \
+                          aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
+           }                                                                  \
+           ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
+       }                                                                      \
+       else {                                                                 \
+           /* For a non-utf8 target string with DEPENDS semantics, all above  \
+            * ASCII Latin1 code points match the complement of any of the     \
+            * classes.  But in utf8, they have their Unicode semantics, so    \
+            * can't just set them in the bitmap, or else regexec.c will think \
+            * they matched when they shouldn't. */                            \
+           ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
+       }                                                                      \
     }                                                                          \
     yesno = '!';                                                               \
     what = WORD;                                                               \
@@ -8237,7 +8382,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
     U8 stored = 0;
     U8 fold;
 
-    fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+    fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
                            : PL_fold[value];
 
     /* It assumes the bit for 'value' has already been set */
@@ -8450,7 +8595,14 @@ parseit:
                    e = RExC_parse;
                    n = 1;
                }
-               if (!SIZE_ONLY) {
+               if (SIZE_ONLY) {
+                   if (LOC) {
+                       ckWARN2reg(RExC_parse,
+                               "\\%c uses Unicode rules, not locale rules",
+                               (int) value);
+                   }
+               }
+               else {
                    if (UCHARAT(RExC_parse) == '^') {
                         RExC_parse++;
                         n--;
@@ -8468,10 +8620,10 @@ parseit:
                /* The \p could match something in the Latin1 range, hence
                 * something that isn't utf8 */
                ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
-               if (FOLD) { /* And one of these could have a multi-char fold */
-                   OP(ret) = ANYOFV;
-               }
                namedclass = ANYOF_MAX;  /* no official name, but it's named */
+
+               /* \p means they want Unicode semantics */
+               RExC_uni_semantics = 1;
                }
                break;
            case 'n':   value = '\n';                   break;
@@ -8622,26 +8774,26 @@ parseit:
                 * --jhi */
                switch ((I32)namedclass) {
                
-               case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
-               case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
-               case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
-               case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
-               case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
-               case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
-               case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
-               case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
-               case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
-               case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
+               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
+               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
+               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
+               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
+               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
+               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
+               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
+               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
+               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
+               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
                 /* \s, \w match all unicode if utf8. */
-                case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
+                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
 #else
                 /* \s, \w match ascii and locale only */
-                case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
+                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
 #endif         
-               case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
+               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                case ANYOF_ASCII:
@@ -8664,6 +8816,7 @@ parseit:
                            stored +=
                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
+                   ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
                    yesno = '!';
                    what = "ASCII";
                    break;              
@@ -8693,6 +8846,9 @@ parseit:
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
+                   if (ASCII_RESTRICTED ) {
+                       ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+                   }
                    break;              
                case ANYOF_MAX:
                    /* this is to handle \p and \P */
@@ -8701,7 +8857,7 @@ parseit:
                    vFAIL("Invalid [::] class");
                    break;
                }
-               if (what) {
+               if (what && ! (ASCII_RESTRICTED)) {
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
@@ -8795,8 +8951,10 @@ parseit:
 
                    /* Currently, we don't look at every value in the range.
                     * Therefore we have to assume the worst case: that if
-                    * folding, it will match more than one character */
-                   if (FOLD) {
+                    * folding, it will match more than one character.  But in
+                    * lookbehind patterns, a class can only be of single
+                    * character length, so disallow those folds there */
+                   if (FOLD && ! RExC_in_lookbehind) {
                      OP(ret) = ANYOFV;
                    }
                }
@@ -8830,8 +8988,9 @@ parseit:
 #endif
                                  Perl_sv_catpvf(aTHX_ listsv,
                                                 "%04"UVxf"\n", f);
-                             else {
+                             else if (! RExC_in_lookbehind) {
                                  /* Any multicharacter foldings
+                                  * (disallowed in lookbehind patterns)
                                   * require the following transform:
                                   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
                                   * where E folds into "pq" and F folds
@@ -8906,8 +9065,10 @@ parseit:
        /* This is the one character in the bitmap that needs special handling
         * under non-locale folding, as it folds to two characters 'ss'.  This
         * happens if it is set and not inverting, or isn't set and are
-        * inverting */
+        * inverting (disallowed in lookbehind patterns because they can't be
+        * variable length) */
        if (! LOC
+           && ! RExC_in_lookbehind
            && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
                ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
        {
@@ -8987,7 +9148,7 @@ parseit:
                op = EXACT;
            }
        }   /* else 2 chars in the bit map: the folds of each other */
-       else if (UNI_SEMANTICS || !isASCII(value)) {
+       else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
 
            /* To join adjacent nodes, they must be the exact EXACTish type.
             * Try to use the most likely type, by using EXACTFU if the regex
@@ -9487,6 +9648,9 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
                 case REGEX_LOCALE_CHARSET:
                     PerlIO_printf(Perl_debug_log, "LOCALE");
                     break;
+                case REGEX_ASCII_RESTRICTED_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+                    break;
                 default:
                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
                     break;