This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Remove unreached code
[perl5.git] / regexec.c
index ff76c84..eba8c81 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1645,7 +1645,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
-               *s == ' ' || isSPACE_LC_utf8((U8*)s),
+               isSPACE_LC_utf8((U8*)s),
                isSPACE_LC(*s)
            );
        case NSPACE:
@@ -1656,7 +1656,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
-               !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
+               !isSPACE_LC_utf8((U8*)s),
                !isSPACE_LC(*s)
            );
        case DIGIT:
@@ -3493,7 +3493,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    ST.nextword,
                    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
                            PL_colors[0], PL_colors[1],
-                           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
+                           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
                        ) 
                    : "not compiled under -Dr",
                    PL_colors[5] );
@@ -3927,31 +3927,74 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            break;
             
        case NREFFL:
-       {
+       {   /* The capture buffer cases.  The ones beginning with N for the
+              named buffers just convert to the equivalent numbered and
+              pretend they were called as the corresponding numbered buffer
+              op.  */
+           /* don't initialize these, it makes C++ unhappy */
            char *s;
            char type;
+           re_fold_t folder;
+           const U8 *fold_array;
+
            PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NREF:
+           folder = foldEQ_locale;
+           fold_array = PL_fold_locale;
+           type = REFFL;
+           goto do_nref;
+
+       case NREFFU:
+           folder = foldEQ_latin1;
+           fold_array = PL_fold_latin1;
+           type = REFFU;
+           goto do_nref;
+
        case NREFF:
-           type = OP(scan);
+           folder = foldEQ;
+           fold_array = PL_fold;
+           type = REFF;
+           goto do_nref;
+
+       case NREF:
+           type = REF;
+           folder = NULL;
+           fold_array = NULL;
+         do_nref:
+
+           /* For the named back references, find the corresponding buffer
+            * number */
            n = reg_check_named_buff_matched(rex,scan);
 
-            if ( n ) {
-                type = REF + ( type - NREF );
-                goto do_ref;
-            } else {
+            if ( ! n ) {
                 sayNO;
-            }
-            /* unreached */
+           }
+           goto do_nref_ref_common;
+
        case REFFL:
            PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
+           folder = foldEQ_locale;
+           fold_array = PL_fold_locale;
+           goto do_ref;
+
+       case REFFU:
+           folder = foldEQ_latin1;
+           fold_array = PL_fold_latin1;
+           goto do_ref;
+
+       case REFF:
+           folder = foldEQ;
+           fold_array = PL_fold;
+           goto do_ref;
+
         case REF:
-       case REFF: 
-           n = ARG(scan);  /* which paren pair */
+           folder = NULL;
+           fold_array = NULL;
+
+         do_ref:
            type = OP(scan);
-         do_ref:  
+           n = ARG(scan);  /* which paren pair */
+
+         do_nref_ref_common:
            ln = PL_regoffs[n].start;
            PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
            if (*PL_reglastparen < n || ln == -1)
@@ -3960,49 +4003,40 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                break;
 
            s = PL_bostr + ln;
-           if (utf8_target && type != REF) {   /* REF can do byte comparison */
-               char *l = locinput;
-               const char *e = PL_bostr + PL_regoffs[n].end;
-               /*
-                * Note that we can't do the "other character" lookup trick as
-                * in the 8-bit case (no pun intended) because in Unicode we
-                * have to map both upper and title case to lower case.
-                */
-               if (type == REFF) {
-                   while (s < e) {
-                       STRLEN ulen1, ulen2;
-                       U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
-                       U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
-
-                       if (l >= PL_regeol)
-                           sayNO;
-                       toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
-                       toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
-                       if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
-                           sayNO;
-                       s += ulen1;
-                       l += ulen2;
-                   }
+           if (type != REF     /* REF can do byte comparison */
+               && (utf8_target
+                    || (type == REFFU
+                        && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
+                            || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
+           { /* XXX handle REFFL better */
+               char * limit = PL_regeol;
+
+               /* This call case insensitively compares the entire buffer
+                   * at s, with the current input starting at locinput, but
+                   * not going off the end given by PL_regeol, and returns in
+                   * limit upon success, how much of the current input was
+                   * matched */
+               if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
+                                   locinput, &limit, 0, utf8_target))
+               {
+                   sayNO;
                }
-               locinput = l;
+               locinput = limit;
                nextchr = UCHARAT(locinput);
                break;
            }
 
-           /* Inline the first character, for speed. */
+           /* Not utf8:  Inline the first character, for speed. */
            if (UCHARAT(s) != nextchr &&
                (type == REF ||
-                (UCHARAT(s) != (type == REFF
-                                 ? PL_fold : PL_fold_locale)[nextchr])))
+                UCHARAT(s) != fold_array[nextchr]))
                sayNO;
            ln = PL_regoffs[n].end - ln;
            if (locinput + ln > PL_regeol)
                sayNO;
            if (ln > 1 && (type == REF
                           ? memNE(s, locinput, ln)
-                          : (type == REFF
-                             ? ! foldEQ(s, locinput, ln)
-                             : ! foldEQ_locale(s, locinput, ln))))
+                          : ! folder(s, locinput, ln)))
                sayNO;
            locinput += ln;
            nextchr = UCHARAT(locinput);
@@ -5511,7 +5545,7 @@ NULL
             n = ARG(scan);
             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
                 locinput += ln;
-            } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
+            } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
                 sayNO;
             } else  {
                 U8 folded[UTF8_MAXBYTES_CASE+1];
@@ -5855,7 +5889,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            char *tmpeol = loceol;
            while (hardcount < max
                    && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
-                                   STRING(p), NULL, 1, UTF_PATTERN))
+                                  STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
            {
                scan = tmpeol;
                tmpeol = loceol;
@@ -5998,7 +6032,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
        if (utf8_target) {
            loceol = PL_regeol;
            while (hardcount < max && scan < loceol &&
-                  (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+                  isSPACE_LC_utf8((U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -6033,7 +6067,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
        if (utf8_target) {
            loceol = PL_regeol;
            while (hardcount < max && scan < loceol &&
-                  !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+                  !isSPACE_LC_utf8((U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -6056,6 +6090,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan++;
        }
        break;
+    case DIGITL:
+       PL_reg_flags |= RF_tainted;
+       if (utf8_target) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol &&
+                  isDIGIT_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isDIGIT_LC(*scan))
+               scan++;
+       }
+       break;
     case NDIGIT:
        if (utf8_target) {
            loceol = PL_regeol;
@@ -6069,6 +6117,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            while (scan < loceol && !isDIGIT(*scan))
                scan++;
        }
+    case NDIGITL:
+       PL_reg_flags |= RF_tainted;
+       if (utf8_target) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol &&
+                  !isDIGIT_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isDIGIT_LC(*scan))
+               scan++;
+       }
+       break;
     case LNBREAK:
         if (utf8_target) {
            loceol = PL_regeol;
@@ -6277,23 +6339,17 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
     if (c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
-       else if (flags & ANYOF_FOLD) {
-           U8 f;
 
-           if (flags & ANYOF_LOCALE) {
-               PL_reg_flags |= RF_tainted;
-               f = PL_fold_locale[c];
-           }
-           else
-               f = PL_fold[c];
-           if (f != c && ANYOF_BITMAP_TEST(n, f))
-               match = TRUE;
-       }
-       
-       if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
+       else if (flags & ANYOF_LOCALE) {
            PL_reg_flags |= RF_tainted;
-           if (
-               (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
+
+           if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
+                && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
+           {
+               match = TRUE;
+           }
+           else if (ANYOF_CLASS_TEST_ANY_SET(n)
+           && ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
                (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
                (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
                (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
@@ -6324,7 +6380,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
                (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
                ) /* How's that for a conditional? */
-           {
+           {
                match = TRUE;
            }
        }
@@ -6335,8 +6391,9 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
     if (!match) {
        if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
            if (c >= 256
-               || ((flags & ANYOF_FOLD) /* Latin1 1 that has a non-Latin1 fold
-                                           should match */
+               || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a
+                                                         non-Latin1 fold
+                                                         should match */
                    && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
            {
                match = TRUE;
@@ -6358,7 +6415,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                }
                if (swash_fetch(sw, utf8_p, 1))
                    match = TRUE;
-               else if (flags & ANYOF_FOLD) {
+               else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
                    if (!match && lenp && av) {
                        I32 i;
                        for (i = 0; i <= av_len(av); i++) {
@@ -6372,104 +6429,77 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                            }
                        }
                    }
-                   if (!match) {
+                   if (!match) { /* See if the folded version matches */
                        U8 folded[UTF8_MAXBYTES_CASE+1];
-
-                       /* See if the folded version matches */
+                       SV** listp;
                        STRLEN foldlen;
+
                        to_utf8_fold(utf8_p, folded, &foldlen);
-                       if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
-                           match = TRUE;
-                       }
-                       else {
-                           /* The fold in a few cases  of an above Latin1 char
-                            * is in the Latin1 range, and hence may be in the
-                            * bitmap */
-                           if (UTF8_IS_INVARIANT(*folded)
-                               && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
-                           {
-                               match = TRUE;
-                           }
-                           else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
-                                    && ANYOF_BITMAP_TEST(n,
-                                         UNI_TO_NATIVE(
-                                            TWO_BYTE_UTF8_TO_UNI(folded[0],
-                                                                  folded[1]))))
-                           { /* Since the fold comes from internally
-                              * generated data, we can safely assume it is
-                              * valid utf8 in the test above */
 
-                               match = TRUE;
+                       /* Consider "k" =~ /[K]/i.  The line above would have
+                        * just folded the 'k' to itself, and that isn't going
+                        * to match 'K'.  So we look through the closure of
+                        * everything that folds to 'k'.  That will find the
+                        * 'K'.  Initialize the list, if necessary */
+                       if (! PL_utf8_foldclosures) {
+
+                           /* If the folds haven't been read in, call a fold
+                            * function to force that */
+                           if (! PL_utf8_tofold) {
+                               U8 dummy[UTF8_MAXBYTES+1];
+                               STRLEN dummy_len;
+                               to_utf8_fold((U8*) "A", dummy, &dummy_len);
                            }
-                            if (! match) {
-                               SV** listp;
-
-                               /* Consider "k" =~ /[K]/i.  The line above
-                                * would have just folded the 'k' to itself,
-                                * and that isn't going to match 'K'.  So we
-                                * look through the closure of everything that
-                                * folds to 'k'.  That will find the 'K'.
-                                * Initialize the list, if necessary */
-                               if (! PL_utf8_foldclosures) {
-
-                                   /* If the folds haven't been read in, call a
-                                   * fold function to force that */
-                                   if (! PL_utf8_tofold) {
-                                       U8 dummy[UTF8_MAXBYTES+1];
-                                       STRLEN dummy_len;
-                                       to_utf8_fold((U8*) "A",
-                                                           dummy, &dummy_len);
-                                   }
-                                   PL_utf8_foldclosures =
-                                         _swash_inversion_hash(PL_utf8_tofold);
-                               }
+                           PL_utf8_foldclosures =
+                                 _swash_inversion_hash(PL_utf8_tofold);
+                       }
 
-                               /* The data structure is a hash with the keys
-                                * every character that is folded to, like 'k',
-                                * and the values each an array of everything
-                                * that folds to its key.  e.g. [ 'k', 'K',
-                                * KELVIN_SIGN ] */
-                               if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                             (char *) folded, foldlen, FALSE)))
+                       /* The data structure is a hash with the keys every
+                        * character that is folded to, like 'k', and the
+                        * values each an array of everything that folds to its
+                        * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                       if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                     (char *) folded, foldlen, FALSE)))
+                       {
+                           AV* list = (AV*) *listp;
+                           IV i;
+                           for (i = 0; i <= av_len(list); i++) {
+                               SV** try_p = av_fetch(list, i, FALSE);
+                               char* try_c;
+                               if (try_p == NULL) {
+                                   Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                               }
+                               /* Don't have to worry about embedded nulls
+                                * since NULL isn't folded or foldable */
+                               try_c = SvPVX(*try_p);
+
+                               /* The fold in a few cases  of an above Latin1
+                                * char is in the Latin1 range, and hence may
+                                * be in the bitmap */
+                               if (UTF8_IS_INVARIANT(*try_c)
+                                   && ANYOF_BITMAP_TEST(n,
+                                                   UNI_TO_NATIVE(*try_c)))
                                {
-                                   AV* list = (AV*) *listp;
-                                   IV i;
-                                   for (i = 0; i <= av_len(list); i++) {
-                                       SV** try_p = av_fetch(list, i, FALSE);
-                                       char* try_c;
-                                       if (try_p == NULL) {
-                                           Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                                       }
-                                       /* Don't have to worry about embeded
-                                        * nulls since NULL isn't folded or
-                                        * foldable */
-                                       try_c = SvPVX(*try_p);
-                                       if (UTF8_IS_INVARIANT(*try_c)
-                                           && ANYOF_BITMAP_TEST(n,
-                                                           UNI_TO_NATIVE(*try_c)))
-                                       {
-                                           match = TRUE;
-                                           break;
-                                       }
-                                       else if
-                                           (UTF8_IS_DOWNGRADEABLE_START(*try_c)
-                                            && ANYOF_BITMAP_TEST(n,
-                                            UNI_TO_NATIVE(
+                                   match = TRUE;
+                                   break;
+                               }
+                               else if
+                                   (UTF8_IS_DOWNGRADEABLE_START(*try_c)
+                                    && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
                                                TWO_BYTE_UTF8_TO_UNI(try_c[0],
-                                                                    try_c[1]))))
-                                       {
-                                           match = TRUE;
-                                           break;
-                                       } else if (swash_fetch(sw,
-                                                               (U8*) try_c, 1))
-                                       {
-                                           match = TRUE;
-                                           break;
-                                       }
-                                   }
+                                                                   try_c[1]))))
+                               {
+                                  /* Since the fold comes from internally
+                                   * generated data, we can safely assume it
+                                   * is valid utf8 in the test above */
+                                   match = TRUE;
+                                   break;
+                               } else if (swash_fetch(sw, (U8*) try_c, 1)) {
+                                   match = TRUE;
+                                   break;
                                }
                            }
-                        }
+                       }
                    }
                }