This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regex: Add pseudo-Posix class: 'cased'
[perl5.git] / regcomp.c
index eee952f..86f6a29 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -749,6 +749,17 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
     DEBUG_STUDYDATA("commit: ",data,0);
 }
 
+/* These macros set, clear and test whether the synthetic start class ('ssc',
+ * passed as the pointer parameter 'node') can match the empty string (EOS).
+ * The bit is kept in the node's 'next_off' field, which saves a bit in the
+ * flags field.  The ssc stands alone, so it never has a real next_off, and the
+ * field is otherwise unused.  The EOS information is used only for
+ * compilation, but theoretically could be passed on to the execution code.
+ * More than one bit could be stored this way; only this one is used so far. */
+#define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
+#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
+#define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
+
 /* Can match anything (initialization) */
 STATIC void
 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
@@ -756,8 +767,8 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
     PERL_ARGS_ASSERT_CL_ANYTHING;
 
     ANYOF_BITMAP_SETALL(cl);
-    cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
-               |ANYOF_NON_UTF8_LATIN1_ALL;
+    cl->flags = ANYOF_UNICODE_ALL;
+    SET_SSC_EOS(cl);
 
     /* If any portion of the regex is to operate under locale rules,
      * initialization includes it.  The reason this isn't done for all regexes
@@ -768,7 +779,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
      * necessary. */
     if (RExC_contains_locale) {
        ANYOF_CLASS_SETALL(cl);     /* /l uses class */
-       cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
+       cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
     }
     else {
        ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
@@ -817,7 +828,7 @@ S_cl_and(struct regnode_charclass_class *cl,
 {
     PERL_ARGS_ASSERT_CL_AND;
 
-    assert(and_with->type == ANYOF);
+    assert(PL_regkind[and_with->type] == ANYOF);
 
     /* I (khw) am not sure all these restrictions are necessary XXX */
     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
@@ -3141,7 +3152,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        StructCopy(&accum, data->start_class,
                                   struct regnode_charclass_class);
                        flags |= SCF_DO_STCLASS_OR;
-                       data->start_class->flags |= ANYOF_EOS;
+                        SET_SSC_EOS(data->start_class);
                    }
                }
 
@@ -3549,7 +3560,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 * utf8 string, so accept a possible false positive for
                 * latin1-range folds */
                if (uc >= 0x100 ||
-                   (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+                   (!(data->start_class->flags & ANYOF_LOCALE)
                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
                    && (!(data->start_class->flags & ANYOF_LOC_FOLD)
                        || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
@@ -3577,7 +3588,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        }
                    }
                }
-               data->start_class->flags &= ~ANYOF_EOS;
+                CLEAR_SSC_EOS(data->start_class);
                if (uc < 0x100)
                  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
            }
@@ -3587,7 +3598,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    ANYOF_BITMAP_SET(data->start_class, uc);
                else
                    data->start_class->flags |= ANYOF_UNICODE_ALL;
-               data->start_class->flags &= ~ANYOF_EOS;
+                CLEAR_SSC_EOS(data->start_class);
                cl_and(data->start_class, and_withp);
            }
            flags &= ~SCF_DO_STCLASS;
@@ -3626,7 +3637,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
                if (uc >= 0x100 ||
-                (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+                (!(data->start_class->flags & ANYOF_LOCALE)
                  && !ANYOF_BITMAP_TEST(data->start_class, uc)
                  && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                {
@@ -3636,7 +3647,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                ANYOF_BITMAP_ZERO(data->start_class);
                if (compat) {
                    ANYOF_BITMAP_SET(data->start_class, uc);
-                   data->start_class->flags &= ~ANYOF_EOS;
+                    CLEAR_SSC_EOS(data->start_class);
                    if (OP(scan) == EXACTFL) {
                        /* XXX This set is probably no longer necessary, and
                         * probably wrong as LOCALE now is on in the initial
@@ -3703,7 +3714,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                            }
                         }
                    }
-                   data->start_class->flags &= ~ANYOF_EOS;
+                    CLEAR_SSC_EOS(data->start_class);
                }
                cl_and(data->start_class, and_withp);
            }
@@ -3820,7 +3831,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        StructCopy(&this_class, data->start_class,
                                   struct regnode_charclass_class);
                        flags |= SCF_DO_STCLASS_OR;
-                       data->start_class->flags |= ANYOF_EOS;
+                        SET_SSC_EOS(data->start_class);
                    }
                } else {                /* Non-zero len */
                    if (flags & SCF_DO_STCLASS_OR) {
@@ -4086,7 +4097,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        else if (OP(scan) == LNBREAK) {
            if (flags & SCF_DO_STCLASS) {
                int value = 0;
-               data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
+                CLEAR_SSC_EOS(data->start_class); /* No match on empty */
                if (flags & SCF_DO_STCLASS_AND) {
                     for (value = 0; value < 256; value++)
                         if (!is_VERTWS_cp(value))
@@ -4120,7 +4131,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            min++;
            if (flags & SCF_DO_STCLASS) {
                 int loop_max = 256;
-               data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
+                CLEAR_SSC_EOS(data->start_class); /* No match on empty */
 
                /* Some of the logic below assumes that switching
                   locale on will only add false positives. */
@@ -4129,8 +4140,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                case SANY:
                default:
-                 do_default:
-                   /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
+#ifdef DEBUGGING
+                   Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
+#endif
+                 do_default:
                    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
                        cl_anything(pRExC_state, data->start_class);
                    break;
@@ -4139,7 +4152,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        goto do_default;
                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
-                                || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
+                               || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
                        cl_anything(pRExC_state, data->start_class);
                    }
                    if (flags & SCF_DO_STCLASS_AND || !value)
@@ -4155,6 +4168,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    break;
                case POSIXA:
                     loop_max = 128;
+                    /* FALL THROUGH */
                case POSIXL:
                case POSIXD:
                case POSIXU:
@@ -4188,6 +4202,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    break;
                case NPOSIXA:
                     loop_max = 128;
+                    /* FALL THROUGH */
                case NPOSIXL:
                case NPOSIXU:
                case NPOSIXD:
@@ -4324,11 +4339,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        cl_init(pRExC_state, data->start_class);
                    }  else {
                        /* AND before and after: combine and continue */
-                       const int was = (data->start_class->flags & ANYOF_EOS);
+                       const int was = TEST_SSC_EOS(data->start_class);
 
                        cl_and(data->start_class, &intrnl);
                        if (was)
-                           data->start_class->flags |= ANYOF_EOS;
+                            SET_SSC_EOS(data->start_class);
                    }
                 }
            }
@@ -4396,11 +4411,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 *minnextp += min;
 
                 if (f & SCF_DO_STCLASS_AND) {
-                    const int was = (data->start_class->flags & ANYOF_EOS);
+                    const int was = TEST_SSC_EOS(data.start_class);
 
                     cl_and(data->start_class, &intrnl);
                     if (was)
-                        data->start_class->flags |= ANYOF_EOS;
+                        SET_SSC_EOS(data->start_class);
                 }
                 if (data) {
                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -4603,7 +4618,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     StructCopy(&accum, data->start_class,
                                struct regnode_charclass_class);
                     flags |= SCF_DO_STCLASS_OR;
-                    data->start_class->flags |= ANYOF_EOS;
+                    SET_SSC_EOS(data->start_class);
                 }
             }
             scan= tail;
@@ -5194,7 +5209,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
        PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
        PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
-       PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
+
+        /* Cased is the same as Alpha in the ASCII range */
+       PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
+       PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
 
        PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
        PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
@@ -6149,11 +6167,11 @@ reStudy:
 
        if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
            && stclass_flag
-           && !(data.start_class->flags & ANYOF_EOS)
+           && ! TEST_SSC_EOS(data.start_class)
            && !cl_is_anything(data.start_class))
        {
            const U32 n = add_data(pRExC_state, 1, "f");
-           data.start_class->flags |= ANYOF_IS_SYNTHETIC;
+           OP(data.start_class) = ANYOF_SYNTHETIC;
 
            Newx(RExC_rxi->data->data[n], 1,
                struct regnode_charclass_class);
@@ -6221,11 +6239,11 @@ reStudy:
        r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
                = r->float_substr = r->float_utf8 = NULL;
 
-       if (!(data.start_class->flags & ANYOF_EOS)
+       if (! TEST_SSC_EOS(data.start_class)
            && !cl_is_anything(data.start_class))
        {
            const U32 n = add_data(pRExC_state, 1, "f");
-           data.start_class->flags |= ANYOF_IS_SYNTHETIC;
+           OP(data.start_class) = ANYOF_SYNTHETIC;
 
            Newx(RExC_rxi->data->data[n], 1,
                struct regnode_charclass_class);
@@ -11151,9 +11169,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
                            break;
                        case 'r':
                            if (memEQ(posixcc, "lowe", 4)) /* lower */
-                               namedclass = ANYOF_LOWER;
+                               namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
                            else if (memEQ(posixcc, "uppe", 4)) /* upper */
-                               namedclass = ANYOF_UPPER;
+                               namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
                            break;
                        case 't':
                            if (memEQ(posixcc, "digi", 4)) /* digit */
@@ -11448,7 +11466,11 @@ parseit:
                    if (UCHARAT(RExC_parse) == '^') {
                         RExC_parse++;
                         n--;
-                        value = value == 'p' ? 'P' : 'p'; /* toggle */
+                         /* toggle.  (The rhs xor gets the single bit that
+                          * differs between P and p; the other xor inverts just
+                          * that bit) */
+                         value ^= 'P' ^ 'p';
+
                         while (isSPACE(UCHARAT(RExC_parse))) {
                              RExC_parse++;
                              n--;
@@ -11684,16 +11706,6 @@ parseit:
                  * Unicode range swash fo this character class */
                 const char *Xname = swash_property_names[classnum];
 
-                /* LOWER and UPPER under fold match ALPHA in the ASCII range,
-                 * and Cased outside it */
-                if (FOLD && ! LOC
-                    && (classnum == _CC_LOWER || classnum == _CC_UPPER))
-                {
-                    ascii_source = PL_Posix_ptrs[_CC_ALPHA];
-                    l1_source = PL_L1Cased;
-                    Xname = "Cased";
-                }
-
                switch ((I32)namedclass) {
 
                case ANYOF_DIGIT:
@@ -11702,6 +11714,7 @@ parseit:
 
                case ANYOF_ALPHANUMERIC: /* C's alnum, in contrast to \w */
                case ANYOF_ALPHA:
+               case ANYOF_CASED:
                case ANYOF_GRAPH:
                case ANYOF_LOWER:
                case ANYOF_PRINT:
@@ -12245,6 +12258,7 @@ parseit:
 #endif
                     goto join_posix;
 
+                case ANYOF_NCASED:
                 case ANYOF_LOWER:
                 case ANYOF_NLOWER:
                 case ANYOF_UPPER:
@@ -12681,7 +12695,7 @@ parseit:
         }
 
         if (warn_super) {
-            ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+            OP(ret) = ANYOF_WARN_SUPER;
         }
     }
 
@@ -13572,10 +13586,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
     static const char * const anyofs[] = {
 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
-    || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
-    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_SPACE != 9 \
-    || _CC_BLANK != 10 || _CC_XDIGIT != 11 || _CC_PSXSPC != 12 \
-    || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
+    || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
+    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
+    || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
+    || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
+    || _CC_VERTSPACE != 16
   #error Need to adjust order of anyofs[]
 #endif
         "[\\w]",
@@ -13596,6 +13611,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         "[:^alnum:]",
         "[:graph:]",
         "[:^graph:]",
+        "[:cased:]",
+        "[:^cased:]",
         "[\\s]",
         "[\\S]",
         "[:blank:]",