regcomp.c: Fix upper loop limit

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 86f6a29..a6090ed 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -794,7 +794,7 @@ S_cl_is_anything(const struct regnode_charclass_class *cl)
  
      PERL_ARGS_ASSERT_CL_IS_ANYTHING;
  
-    for (value = 0; value <= ANYOF_MAX; value += 2)
+    for (value = 0; value < ANYOF_MAX; value += 2)
         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
             return 1;
      if (!(cl->flags & ANYOF_UNICODE_ALL))
@@ -5218,6 +5218,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
  
         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
+       PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
  
         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
@@ -6798,11 +6799,11 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
         if (UTF)
             do {
                 RExC_parse += UTF8SKIP(RExC_parse);
-           } while (isALNUM_utf8((U8*)RExC_parse));
+           } while (isWORDCHAR_utf8((U8*)RExC_parse));
         else
             do {
                 RExC_parse++;
-           } while (isALNUM(*RExC_parse));
+           } while (isWORDCHAR(*RExC_parse));
      } else {
         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
          vFAIL("Group name must start with a non-digit word character");
@@ -11351,7 +11352,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         const char *s = RExC_parse;
         const char  c = *s++;
  
-       while (isALNUM(*s))
+       while (isWORDCHAR(*s))
             s++;
         if (*s && c == *s && s[1] == ']') {
             SAVEFREESV(RExC_rx_sv);
@@ -11626,7 +11627,7 @@ parseit:
                 }
             default:
                 /* Allow \_ to not give an error */
-               if (!SIZE_ONLY && isALNUM(value) && value != '_') {
+               if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
                     SAVEFREESV(RExC_rx_sv);
                     SAVEFREESV(listsv);
                     ckWARN2reg(RExC_parse,
@@ -11695,280 +11696,251 @@ parseit:
  
             if (! SIZE_ONLY) {
                  U8 classnum = namedclass_to_classnum(namedclass);
+                if (namedclass >= ANYOF_MAX) {  /* If a special class */
+                    if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
+
+                        /* Here, should be \h, \H, \v, or \V.  Neither /d nor
+                         * /l make a difference in what these match.  There
+                         * would be problems if these characters had folds
+                         * other than themselves, as cp_list is subject to
+                         * folding. */
+                        if (classnum != _CC_VERTSPACE) {
+                            assert(   namedclass == ANYOF_HORIZWS
+                                   || namedclass == ANYOF_NHORIZWS);
+
+                            /* It turns out that \h is just a synonym for
+                             * XPosixBlank */
+                            classnum = _CC_BLANK;
+                        }
  
-                /* The ascii range inversion list */
-                SV* ascii_source = PL_Posix_ptrs[classnum];
-
-                /* The full Latin1 range inversion list */
-                SV* l1_source = PL_L1Posix_ptrs[classnum];
-
-                /* The name of the property to use to match the full eXtended
-                 * Unicode range swash fo this character class */
-                const char *Xname = swash_property_names[classnum];
+                        _invlist_union_maybe_complement_2nd(
+                                cp_list,
+                                PL_XPosix_ptrs[classnum],
+                                namedclass % 2,  /* Complement if odd
+                                                    (NHORIZWS, NVERTWS) */
+                                &cp_list);
+                    }
+                }
+                else if (classnum == _CC_ASCII) {
+#ifdef HAS_ISASCII
+                    if (LOC) {
+                        ANYOF_CLASS_SET(ret, namedclass);
+                    }
+                    else
+#endif  /* Not isascii(); just use the hard-coded definition for it */
+                        _invlist_union_maybe_complement_2nd(
+                                posixes,
+                                PL_ASCII,
+                                namedclass % 2, /* Complement if odd (NASCII) */
+                                &posixes);
+                }
+                else {  /* Garden variety class */
+
+                    /* The ascii range inversion list */
+                    SV* ascii_source = PL_Posix_ptrs[classnum];
+
+                    /* The full Latin1 range inversion list */
+                    SV* l1_source = PL_L1Posix_ptrs[classnum];
+
+                    /* This code is structured into two major clauses.  The
+                     * first is for classes whose complete definitions may not
+                     * already be known.  If not, the Latin1 definition
+                     * (guaranteed to already be known) is used plus code is
+                     * generated to load the rest at run-time (only if needed).
+                     * If the complete definition is known, it drops down to
+                     * the second clause, where the complete definition is
+                     * known */
+
+                    if (classnum < _FIRST_NON_SWASH_CC) {
+
+                        /* Here, the class has a swash, which may or may not
+                         * already be loaded */
+
+                        /* The name of the property to use to match the full
+                         * eXtended Unicode range swash for this character
+                         * class */
+                        const char *Xname = swash_property_names[classnum];
+
+                        if ( !  PL_utf8_swash_ptrs[classnum]) {
+                            if (namedclass % 2 == 0) { /* A non-complemented
+                                                          class */
+                                /* If not /a matching, there are code points we
+                                 * don't know at compile time.  Arrange for the
+                                 * unknown matches to be loaded at run-time, if
+                                 * needed */
+                                if (! AT_LEAST_ASCII_RESTRICTED) {
+                                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
+                                                                 Xname);
+                                }
+                                if (LOC) {  /* Under locale, set run-time
+                                               lookup */
+                                    ANYOF_CLASS_SET(ret, namedclass);
+                                }
+                                else {
+                                    /* Add the current class's code points to
+                                     * the running total */
+                                    _invlist_union(posixes,
+                                                   (AT_LEAST_ASCII_RESTRICTED)
+                                                        ? ascii_source
+                                                        : l1_source,
+                                                   &posixes);
+                                }
+                            }
+                            else {  /* A complemented class */
+                                if (AT_LEAST_ASCII_RESTRICTED) {
+                                    /* Under /a should match everything above
+                                     * ASCII, plus the complement of the set's
+                                     * ASCII matches */
+                                    _invlist_union_complement_2nd(posixes,
+                                                                  ascii_source,
+                                                                  &posixes);
+                                }
+                                else {
+                                    /* Arrange for the unknown matches to be
+                                     * loaded at run-time, if needed */
+                                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
+                                                                 Xname);
+                                    runtime_posix_matches_above_Unicode = TRUE;
+                                    if (LOC) {
+                                        ANYOF_CLASS_SET(ret, namedclass);
+                                    }
+                                    else {
+
+                                        /* We want to match everything in
+                                         * Latin1, except those things that
+                                         * l1_source matches */
+                                        SV* scratch_list = NULL;
+                                        _invlist_subtract(PL_Latin1, l1_source,
+                                                          &scratch_list);
+
+                                        /* Add the list from this class to the
+                                         * running total */
+                                        if (! posixes) {
+                                            posixes = scratch_list;
+                                        }
+                                        else {
+                                            _invlist_union(posixes,
+                                                           scratch_list,
+                                                           &posixes);
+                                            SvREFCNT_dec_NN(scratch_list);
+                                        }
+                                        if (DEPENDS_SEMANTICS) {
+                                            ANYOF_FLAGS(ret)
+                                                  |= ANYOF_NON_UTF8_LATIN1_ALL;
+                                        }
+                                    }
+                                }
+                            }
+                            goto namedclass_done;
+                        }
  
-               switch ((I32)namedclass) {
+                        /* Here, there is a swash loaded for the class.  If no
+                         * inversion list for it yet, get it */
+                        if (! PL_XPosix_ptrs[classnum]) {
+                            PL_XPosix_ptrs[classnum]
+                             = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
+                        }
+                    }
  
-               case ANYOF_DIGIT:
-                    l1_source = ascii_source;
-                    /* FALL THROUGH */
+                    /* Here there is an inversion list already loaded for the
+                     * entire class */
  
-               case ANYOF_ALPHANUMERIC: /* C's alnum, in contrast to \w */
-               case ANYOF_ALPHA:
-               case ANYOF_CASED:
-               case ANYOF_GRAPH:
-               case ANYOF_LOWER:
-               case ANYOF_PRINT:
-               case ANYOF_PUNCT:
-               case ANYOF_UPPER:
-               case ANYOF_WORDCHAR:
-                    if ( !  PL_utf8_swash_ptrs[classnum]) {
-
-                        /* If not /a matching, there are code points we don't
-                         * know at compile time.  Arrange for the unknown
-                         * matches to be loaded at run-time, if needed */
-                        if (! AT_LEAST_ASCII_RESTRICTED) {
-                            Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", Xname);
-                        }
-                        if (LOC) {  /* Under locale, set run-time lookup */
-                            ANYOF_CLASS_SET(ret, namedclass);
-                        }
-                        else {
-                            /* Add the current class's code points to the
-                             * running total */
+                    if (namedclass % 2 == 0) {  /* A non-complemented class,
+                                                   like ANYOF_PUNCT */
+                        if (! LOC) {
+                            /* For non-locale, just add it to any existing list
+                             * */
                              _invlist_union(posixes,
                                             (AT_LEAST_ASCII_RESTRICTED)
-                                                ? ascii_source
-                                                : l1_source,
+                                               ? ascii_source
+                                               : PL_XPosix_ptrs[classnum],
                                             &posixes);
                          }
-                        break;
-                    }
-                    if (! PL_XPosix_ptrs[classnum]) {
-                        PL_XPosix_ptrs[classnum]
-                            = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
-                    }
-                    /* FALL THROUGH */
-
-               case ANYOF_BLANK:
-               case ANYOF_CNTRL:
-               case ANYOF_PSXSPC:
-               case ANYOF_SPACE:
-               case ANYOF_XDIGIT:
-                    if (! LOC) {
-                        /* For non-locale, just add it to any existing list */
-                        _invlist_union(posixes,
-                                       (AT_LEAST_ASCII_RESTRICTED)
-                                           ? ascii_source
-                                           : PL_XPosix_ptrs[classnum],
-                                       &posixes);
-                    }
-                    else {  /* Locale */
-                        SV* scratch_list = NULL;
+                        else {  /* Locale */
+                            SV* scratch_list = NULL;
  
-                        /* For above Latin1 code points, we use the full
-                         * Unicode range */
-                        _invlist_intersection(PL_AboveLatin1,
-                                              PL_XPosix_ptrs[classnum],
-                                              &scratch_list);
-                        /* And set the output to it, adding instead if there
-                         * already is an output.  Checking if 'posixes' is NULL
-                         * first saves an extra clone.  Its reference count
-                         * will be decremented at the next union, etc, or if
-                         * this is the only instance, at the end of the routine
-                         * */
-                        if (! posixes) {
-                            posixes = scratch_list;
-                        }
-                        else {
-                            _invlist_union(posixes, scratch_list, &posixes);
-                            SvREFCNT_dec_NN(scratch_list);
-                        }
+                            /* For above Latin1 code points, we use the full
+                             * Unicode range */
+                            _invlist_intersection(PL_AboveLatin1,
+                                                  PL_XPosix_ptrs[classnum],
+                                                  &scratch_list);
+                            /* And set the output to it, adding instead if
+                             * there already is an output.  Checking if
+                             * 'posixes' is NULL first saves an extra clone.
+                             * Its reference count will be decremented at the
+                             * next union, etc, or if this is the only
+                             * instance, at the end of the routine */
+                            if (! posixes) {
+                                posixes = scratch_list;
+                            }
+                            else {
+                                _invlist_union(posixes, scratch_list, &posixes);
+                                SvREFCNT_dec_NN(scratch_list);
+                            }
  
  #ifndef HAS_ISBLANK
-                        if (namedclass != ANYOF_BLANK) {
-#endif
-                            /* Set this class in the node for runtime
-                             * matching */
-                            ANYOF_CLASS_SET(ret, namedclass);
-#ifndef HAS_ISBLANK
-                        }
-                        else {
-                            /* No isblank(), use the hard-coded ASCII-range
-                             * blanks, adding them to the running total. */
-
-                            _invlist_union(posixes, ascii_source, &posixes);
-                        }
+                            if (namedclass != ANYOF_BLANK) {
  #endif
-                    }
-                   break;
-
-               case ANYOF_NDIGIT:
-                    l1_source = ascii_source;
-                    /* FALL THROUGH */
-
-               case ANYOF_NALPHANUMERIC:
-               case ANYOF_NALPHA:
-               case ANYOF_NGRAPH:
-               case ANYOF_NLOWER:
-               case ANYOF_NPRINT:
-               case ANYOF_NPUNCT:
-               case ANYOF_NUPPER:
-               case ANYOF_NWORDCHAR:
-                    if ( !  PL_utf8_swash_ptrs[classnum]) {
-                        if (AT_LEAST_ASCII_RESTRICTED) {
-                            /* Under /a should match everything above ASCII,
-                             * and the complement of the set's ASCII matches */
-                            _invlist_union_complement_2nd(posixes, ascii_source,
-                                                          &posixes);
-                        }
-                        else {
-                            /* Arrange for the unknown matches to be loaded at
-                             * run-time, if needed */
-                            Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", Xname);
-                            runtime_posix_matches_above_Unicode = TRUE;
-                            if (LOC) {
+                                /* Set this class in the node for runtime
+                                 * matching */
                                  ANYOF_CLASS_SET(ret, namedclass);
+#ifndef HAS_ISBLANK
                              }
                              else {
+                                /* No isblank(), use the hard-coded ASCII-range
+                                 * blanks, adding them to the running total. */
  
-                                /* We want to match everything in Latin1,
-                                 * except those things that l1_source matches
-                                 * */
-                                SV* scratch_list = NULL;
-                                _invlist_subtract(PL_Latin1, l1_source,
-                                                  &scratch_list);
-
-                                /* Add the list from this class to the running
-                                 * total */
-                                if (! posixes) {
-                                    posixes = scratch_list;
-                                }
-                                else {
-                                    _invlist_union(posixes, scratch_list,
-                                                   &posixes);
-                                    SvREFCNT_dec_NN(scratch_list);
-                                }
-                                if (DEPENDS_SEMANTICS) {
-                                    ANYOF_FLAGS(ret)
-                                                |= ANYOF_NON_UTF8_LATIN1_ALL;
-                                }
+                                _invlist_union(posixes, ascii_source, &posixes);
                              }
+#endif
                          }
-                        break;
                      }
-                    if (! PL_XPosix_ptrs[classnum]) {
-                        PL_XPosix_ptrs[classnum]
-                            = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
-                    }
-                    /* FALL THROUGH */
-
-               case ANYOF_NBLANK:
-               case ANYOF_NCNTRL:
-               case ANYOF_NPSXSPC:
-               case ANYOF_NSPACE:
-               case ANYOF_NXDIGIT:
-                    if (! LOC) {
-                        _invlist_union_complement_2nd(
+                    else {  /* A complemented class, like ANYOF_NPUNCT */
+                        if (! LOC) {
+                            _invlist_union_complement_2nd(
                                                  posixes,
                                                  (AT_LEAST_ASCII_RESTRICTED)
                                                      ? ascii_source
                                                      : PL_XPosix_ptrs[classnum],
                                                  &posixes);
-                        /* Under /d, everything in the upper half of the Latin1
-                         * range matches this complement */
-                        if (DEPENDS_SEMANTICS) {
-                            ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
-                        }
-                    }
-                    else {  /* Locale */
-                        SV* scratch_list = NULL;
-                        _invlist_subtract(PL_AboveLatin1,
-                                          PL_XPosix_ptrs[classnum],
-                                          &scratch_list);
-                        if (! posixes) {
-                            posixes = scratch_list;
-                        }
-                        else {
-                            _invlist_union(posixes, scratch_list, &posixes);
-                            SvREFCNT_dec_NN(scratch_list);
+                            /* Under /d, everything in the upper half of the
+                             * Latin1 range matches this complement */
+                            if (DEPENDS_SEMANTICS) {
+                                ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+                            }
                          }
+                        else {  /* Locale */
+                            SV* scratch_list = NULL;
+                            _invlist_subtract(PL_AboveLatin1,
+                                              PL_XPosix_ptrs[classnum],
+                                              &scratch_list);
+                            if (! posixes) {
+                                posixes = scratch_list;
+                            }
+                            else {
+                                _invlist_union(posixes, scratch_list, &posixes);
+                                SvREFCNT_dec_NN(scratch_list);
+                            }
  #ifndef HAS_ISBLANK
-                        if (namedclass != ANYOF_NBLANK) {
+                            if (namedclass != ANYOF_NBLANK) {
  #endif
-                            ANYOF_CLASS_SET(ret, namedclass);
+                                ANYOF_CLASS_SET(ret, namedclass);
  #ifndef HAS_ISBLANK
-                        }
-                        else {
-                            /* Get the list of all code points in Latin1 that
-                             * are not ASCII blanks, and add them to the
-                             * running total */
-                            _invlist_subtract(PL_Latin1, ascii_source,
-                                              &scratch_list);
-                            _invlist_union(posixes, scratch_list, &posixes);
-                            SvREFCNT_dec_NN(scratch_list);
-                        }
-#endif
-                    }
-                   break;
-
-               case ANYOF_ASCII:
-#ifdef HAS_ISASCII
-                   if (LOC) {
-                       ANYOF_CLASS_SET(ret, namedclass);
-                   }
-                    else
-#endif  /* Not isascii(); just use the hard-coded definition for it */
-                        _invlist_union(posixes, PL_ASCII, &posixes);
-                   break;
-               case ANYOF_NASCII:
-#ifdef HAS_ISASCII
-                   if (LOC) {
-                       ANYOF_CLASS_SET(ret, namedclass);
-                   }
-                    else {
+                            }
+                            else {
+                                /* Get the list of all code points in Latin1
+                                 * that are not ASCII blanks, and add them to
+                                 * the running total */
+                                _invlist_subtract(PL_Latin1, ascii_source,
+                                                  &scratch_list);
+                                _invlist_union(posixes, scratch_list, &posixes);
+                                SvREFCNT_dec_NN(scratch_list);
+                            }
  #endif
-                        _invlist_union_complement_2nd(posixes,
-                                                    PL_ASCII, &posixes);
-                        if (DEPENDS_SEMANTICS) {
-                            ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                          }
-#ifdef HAS_ISASCII
                      }
-#endif
-                   break;
-
-               case ANYOF_HORIZWS:
-                    /* For these, we use the cp_list, as neither /d nor /l make
-                     * a difference in what these match.  There would be
-                     * problems if these characters had folds other than
-                     * themselves, as cp_list is subject to folding.
-                     *
-                     * It turns out that \h is just a synonym for XPosixBlank */
-                    classnum = _CC_BLANK;
-                   /* FALL THROUGH */
-
-               case ANYOF_VERTWS:
-                   _invlist_union(cp_list, PL_XPosix_ptrs[classnum], &cp_list);
-                   break;
-
-               case ANYOF_NHORIZWS:
-                    classnum = _CC_BLANK;
-                   /* FALL THROUGH */
-
-               case ANYOF_NVERTWS:
-                    _invlist_union_complement_2nd(cp_list,
-                                                  PL_XPosix_ptrs[classnum],
-                                                  &cp_list);
-                   break;
-
-               case ANYOF_UNIPROP: /* this is to handle \p and \P */
-                   break;
-
-               default:
-                   vFAIL("Invalid [::] class");
-                   break;
-               }
-
+                }
+              namedclass_done:
                 continue;   /* Go get next character */
             }
         } /* end of namedclass \blah */