regexec.c: refactor and comment the CCC_TRY macros

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index be0feeb..1c49bb1 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -180,93 +180,87 @@
  #endif
  
  
-#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
-        case NAMEL:                                                         \
-            PL_reg_flags |= RF_tainted;                                     \
-            /* FALL THROUGH */                                              \
-        case NAME:                                                          \
-            if (!nextchr)                                                   \
-                sayNO;                                                      \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                \
-                    bool ok;                                                \
-                    ENTER;                                                  \
-                    save_re_context();                                      \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
-                    assert(ok);                                             \
-                    LEAVE;                                                  \
-                }                                                           \
-                if (!(OP(scan) == NAME                                      \
-                    ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                          \
-                {                                                           \
-                    sayNO;                                                  \
-                }                                                           \
-                locinput += PL_utf8skip[nextchr];                           \
-                nextchr = UCHARAT(locinput);                                \
-                break;                                                      \
-            }                                                               \
-           /* Drops through to the macro that calls this one */
-
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
-    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
-            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
-                sayNO;                                                      \
-            nextchr = UCHARAT(++locinput);                                  \
-            break
-
-/* Almost identical to the above, but has a case for a node that matches chars
- * between 128 and 255 using Unicode (latin1) semantics. */
-#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
-    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
-            if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
-                sayNO;                                                       \
-            nextchr = UCHARAT(++locinput);                                   \
-            break
-
-#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
-        case NAMEL:                                                          \
-            PL_reg_flags |= RF_tainted;                                      \
-            /* FALL THROUGH */                                               \
-        case NAME :                                                          \
-            if (!nextchr && locinput >= PL_regeol)                           \
-                sayNO;                                                       \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
-                if (!CAT2(PL_utf8_,CLASS)) {                                 \
-                    bool ok;                                                 \
-                    ENTER;                                                   \
-                    save_re_context();                                       \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
-                    assert(ok);                                              \
-                    LEAVE;                                                   \
-                }                                                            \
-                if ((OP(scan) == NAME                                        \
-                    ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                           \
-                {                                                            \
-                    sayNO;                                                   \
-                }                                                            \
-                locinput += PL_utf8skip[nextchr];                            \
-                nextchr = UCHARAT(locinput);                                 \
-                break;                                                       \
-            }
-
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
-    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
-            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
-                sayNO;                                                       \
-            nextchr = UCHARAT(++locinput);                                   \
-            break
-
-
-#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
-    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
-            if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
-                sayNO;                                                       \
-            nextchr = UCHARAT(++locinput);                                   \
-            break
-
-
+/* The actual code for CCC_TRY, which uses several variables from the routine
+ * it's called from.  It is designed to be the bulk of a case statement.
+ * FUNC is the macro or function to call on non-utf8 targets that indicates if
+ *      nextchr matches the class.
+ * UTF8_TEST is the whole test expression to use for utf8 targets
+ * CLASS and STR are handed to LOAD_UTF8_CHARCLASS, which is relied on to load
+ *     in the swash for the class if it is not already present
+ * POS_OR_NEG is either empty or ! to complement the results of the FUNC or
+ *     UTF8_TEST test.
+ * The logic is: Fail if we're at the end-of-string; otherwise if the target is
+ * utf8 and a variant, load the swash if necessary and test using the utf8
+ * test.  Advance to the next character if the test is ok, otherwise fail.  If
+ * not utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
+ * fails, or advance to the next character */
+
+#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
+    if (locinput >= PL_regeol) {                                              \
+       sayNO;                                                                \
+    }                                                                         \
+    if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
+       LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
+       if (POS_OR_NEG (UTF8_TEST)) {                                         \
+           sayNO;                                                            \
+       }                                                                     \
+       locinput += PL_utf8skip[nextchr];                                     \
+       nextchr = UCHARAT(locinput);                                          \
+       break;                                                                \
+    }                                                                         \
+    if (POS_OR_NEG (FUNC(nextchr))) {                                         \
+       sayNO;                                                                \
+    }                                                                         \
+    nextchr = UCHARAT(++locinput);                                            \
+    break;
+
+/* Handle the non-locale cases for a character class (NAME) and its complement
+ * (NNAME).  NAME passes ! to _CCC_TRY_CODE and NNAME passes nothing, because
+ * that code fails when its (possibly complemented) test is true; so the
+ * positive case must invert the class test to fail on a non-member.  The
+ * swash is stored in a predictable PL_ place */
+#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)                     \
+    case NAME:                                                                \
+       _CCC_TRY_CODE( !, FUNC,                                               \
+                         cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
+                                           (U8*)locinput, TRUE)),            \
+                         CLASS, STR)                                         \
+    case NNAME:                                                               \
+       _CCC_TRY_CODE(  , FUNC,                                               \
+                         cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
+                                           (U8*)locinput, TRUE)),            \
+                         CLASS, STR)                                         \
+
+/* Generate the case statements for both locale and non-locale character
+ * classes in regmatch for classes that don't have special unicode semantics.
+ * Locales don't use an immediate swash, but an intermediary special locale
+ * function that is called on the pointer to the current place in the input
+ * string.  That function will resolve to needing the same swash.  One might
+ * think that because we don't know what the locale will match, we shouldn't
+ * check with the swash loading function that it loaded properly; ie, that we
+ * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
+ * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
+ * irrelevant here */
+#define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
+               NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
+               CLASS, STR)                                                   \
+    case NAMEL:                                                               \
+       PL_reg_flags |= RF_tainted;                                           \
+       _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
+    case NNAMEL:                                                              \
+       PL_reg_flags |= RF_tainted;                                           \
+       _CCC_TRY_CODE(  , LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
+    /* Generate the non-locale cases */                                       \
+    _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
+
+/* This is like CCC_TRY, but has an extra set of parameters for generating case
+ * statements to handle separate Unicode semantics nodes */
+#define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
+                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
+                 NAMEU, NNAMEU, FUNCU,                                        \
+                 CLASS, STR)                                                  \
+    CCC_TRY(NAME, NNAME, FUNC, NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, CLASS, STR) \
+    _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
  
  /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
  
@@ -1557,13 +1551,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                  tmp = cBOOL((OP(c) == BOUNDL)
                              ? isALNUM_LC(tmp)
                              : (isWORDCHAR_L1(tmp)
-                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
+                               && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
                 REXEC_FBC_SCAN(
                     if (tmp ==
                          !((OP(c) == BOUNDL)
                            ? isALNUM_LC(*s)
                            : (isWORDCHAR_L1((U8) *s)
-                             && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                             && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
                     {
                         tmp = !tmp;
                         REXEC_FBC_TRYIT;
@@ -1600,13 +1594,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                  tmp = cBOOL((OP(c) == NBOUNDL)
                              ? isALNUM_LC(tmp)
                              : (isWORDCHAR_L1(tmp)
-                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
+                               && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
                 REXEC_FBC_SCAN(
                     if (tmp == ! cBOOL(
                              (OP(c) == NBOUNDL)
                              ? isALNUM_LC(*s)
                              : (isWORDCHAR_L1((U8) *s)
-                               && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                               && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
                      {
                         tmp = !tmp;
                      }
@@ -1616,44 +1610,68 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
                 goto got_it;
             break;
-       case ALNUM:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_PERL_WORD(),
-               swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-                (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
-           );
         case ALNUML:
             REXEC_FBC_CSCAN_TAINT(
                 isALNUM_LC_utf8((U8*)s),
                 isALNUM_LC(*s)
             );
+       case ALNUMU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+                isWORDCHAR_L1((U8) *s)
+           );
+       case ALNUM:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+                isWORDCHAR((U8) *s)
+           );
+       case NALNUMU: /* complement of ALNUMU: negate BOTH tests below */
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               !swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+                ! isWORDCHAR_L1((U8) *s)
+           );
         case NALNUM:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-                ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
+                ! isALNUM(*s)
             );
         case NALNUML:
             REXEC_FBC_CSCAN_TAINT(
                 !isALNUM_LC_utf8((U8*)s),
                 !isALNUM_LC(*s)
             );
+       case SPACEU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+               *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
+                isSPACE_L1((U8) *s)
+           );
         case SPACE:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
-                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
+                isSPACE((U8) *s)
             );
         case SPACEL:
             REXEC_FBC_CSCAN_TAINT(
                 isSPACE_LC_utf8((U8*)s),
                 isSPACE_LC(*s)
             );
+       case NSPACEU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+               !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
+                ! isSPACE_L1((U8) *s)
+           );
         case NSPACE:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
-                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
+                ! isSPACE((U8) *s)
             );
         case NSPACEL:
             REXEC_FBC_CSCAN_TAINT(
@@ -3651,10 +3669,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             else {
                 ln = (locinput != PL_bostr) ?
                     UCHARAT(locinput - 1) : '\n';
-               if (FLAGS(scan) & USE_UNI) {
+               if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
  
                      /* Here, can't be BOUNDL or NBOUNDL because they never set
-                     * the flags to USE_UNI */
+                     * the flags to REGEX_UNICODE_CHARSET */
                      ln = isWORDCHAR_L1(ln);
                      n = isWORDCHAR_L1(nextchr);
                  }
@@ -3696,14 +3714,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             }
             break;
         /* Special char classes - The defines start on line 129 or so */
-        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
-        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
+                 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
+                 ALNUMU, NALNUMU, isWORDCHAR_L1,
+                 perl_word, "a");
  
-        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
-        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
+                 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
+                 SPACEU, NSPACEU, isSPACE_L1,
+                 perl_space, " ");
  
-       CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
-       CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
+        CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
+               DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
+               posix_digit, "0");
  
         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
                        a Unicode extended Grapheme Cluster */
@@ -5932,8 +5955,9 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan++;
         }
         break;
-    case ALNUM:
+    case ALNUMU:
         if (utf8_target) {
+    utf8_wordchar:
             loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_ALNUM();
             while (hardcount < max && scan < loceol &&
@@ -5942,14 +5966,17 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
-        } else if (FLAGS(p) & USE_UNI) {
+        } else {
              while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
                  scan++;
              }
-       } else {
-            while (scan < loceol && isALNUM((U8) *scan)) {
-                scan++;
-            }
+       }
+       break;
+    case ALNUM:
+       if (utf8_target)
+           goto utf8_wordchar;
+       while (scan < loceol && isALNUM((U8) *scan)) {
+           scan++;
         }
         break;
      case ALNUML:
@@ -5966,24 +5993,30 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan++;
         }
         break;
-    case NALNUM:
+    case NALNUMU:
         if (utf8_target) {
+
+    utf8_Nwordchar:
+
             loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_ALNUM();
             while (hardcount < max && scan < loceol &&
-                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+                   ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
              {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
-        } else if (FLAGS(p) & USE_UNI) {
+        } else {
              while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
                  scan++;
              }
-       } else {
-            while (scan < loceol && ! isALNUM((U8) *scan)) {
-                scan++;
-            }
+       }
+       break;
+    case NALNUM:
+       if (utf8_target)
+           goto utf8_Nwordchar;
+       while (scan < loceol && ! isALNUM((U8) *scan)) {
+           scan++;
         }
         break;
      case NALNUML:
@@ -6000,8 +6033,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan++;
         }
         break;
-    case SPACE:
+    case SPACEU:
         if (utf8_target) {
+
+    utf8_space:
+
             loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_SPACE();
             while (hardcount < max && scan < loceol &&
@@ -6011,13 +6047,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
-        } else if (FLAGS(p) & USE_UNI) {
+           break;
+       }
+       else {
              while (scan < loceol && isSPACE_L1((U8) *scan)) {
                  scan++;
              }
-       } else {
-            while (scan < loceol && isSPACE((U8) *scan))
-                scan++;
+           break;
+       }
+    case SPACE:
+       if (utf8_target)
+           goto utf8_space;
+
+       while (scan < loceol && isSPACE((U8) *scan)) {
+           scan++;
         }
         break;
      case SPACEL:
@@ -6034,25 +6077,34 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan++;
         }
         break;
-    case NSPACE:
+    case NSPACEU:
         if (utf8_target) {
+
+    utf8_Nspace:
+
             loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_SPACE();
             while (hardcount < max && scan < loceol &&
-                  !(*scan == ' ' ||
-                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+                  ! (*scan == ' ' ||
+                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
              {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
-        } else if (FLAGS(p) & USE_UNI) {
+           break;
+       }
+       else {
              while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
                  scan++;
              }
-       } else {
-            while (scan < loceol && ! isSPACE((U8) *scan)) {
-                scan++;
-            }
+       }
+       break;
+    case NSPACE:
+       if (utf8_target)
+           goto utf8_Nspace;
+
+       while (scan < loceol && ! isSPACE((U8) *scan)) {
+           scan++;
         }
         break;
      case NSPACEL: