regcomp.c: Reorder two switch cases

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 9a38eb1..f38ed9c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -122,6 +122,12 @@ typedef struct scan_frame {
      struct scan_frame *next_frame;      /* next frame */
  } scan_frame;
  
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c)                                              \
+                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
+
  struct RExC_state_t {
      U32                flags;                  /* RXf_* are we folding, multilining? */
      U32                pm_flags;               /* PMf_* stuff from the calling PMOP */
@@ -641,6 +647,12 @@ static const scan_data_t zero_scan_data =
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
+#define        vWARN(loc, m) STMT_START {                                      \
+    const IV offset = loc - RExC_precomp;                              \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,       \
+           REPORT_LOCATION_ARGS(offset));              \
+} STMT_END
+
  #define        vWARN_dep(loc, m) STMT_START {                                  \
      const IV offset = loc - RExC_precomp;                              \
      __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,   \
@@ -880,8 +892,6 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
      PerlIO_printf(Perl_debug_log,"\n");                              \
  });
  
-#ifdef DEBUGGING
-
  /* is c a control character for which we have a mnemonic? */
  #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  
@@ -905,8 +915,6 @@ S_cntrl_to_mnemonic(const U8 c)
      return NULL;
  }
  
-#endif
-
  /* Mark that we cannot extend a found fixed substring at this point.
     Update the longest found anchored substring and the longest found
     floating substrings if needed. */
@@ -6871,7 +6879,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          SAVEFREEPV(RExC_recurse);
      }
  
-reStudy:
+  reStudy:
      r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
      DEBUG_r(
          RExC_study_chunk_recursed_count= 0;
@@ -10305,8 +10313,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      ret = reganode(pRExC_state,NGROUPP,num);
                      goto insert_if_check_paren;
                 }
-               else if (strnEQ(RExC_parse, "DEFINE",
-                                       MIN(DEFINE_len, RExC_end - RExC_parse)))
+               else if (RExC_end - RExC_parse >= DEFINE_len
+                        && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
                  {
                     ret = reganode(pRExC_state,DEFINEP,0);
                     RExC_parse += DEFINE_len;
@@ -11613,7 +11621,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
      PERL_ARGS_ASSERT_REGATOM;
  
-tryagain:
+  tryagain:
      switch ((U8)*RExC_parse) {
      case '^':
         RExC_seen_zerolen++;
@@ -11651,7 +11659,7 @@ tryagain:
                         FALSE, /* means parse the whole char class */
                         TRUE, /* allow multi-char folds */
                         FALSE, /* don't silence non-portable warnings. */
-                       RExC_strict,
+                       (bool) RExC_strict,
                         NULL);
         if (*RExC_parse != ']') {
             RExC_parse = oregcomp_parse;
@@ -11773,40 +11781,39 @@ tryagain:
              arg = ANYOF_WORDCHAR;
              goto join_posix;
  
-       case 'b':
+       case 'B':
             RExC_seen_zerolen++;
              RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = BOUND + get_regex_charset(RExC_flags);
-            if (op > BOUNDA) {  /* /aa is same as /a */
-                op = BOUNDA;
+           op = NBOUND + get_regex_charset(RExC_flags);
+            if (op > NBOUNDA) { /* /aa is same as /a */
+                op = NBOUNDA;
              }
-            else if (op == BOUNDL) {
+            else if (op == NBOUNDL) {
                  RExC_contains_locale = 1;
              }
             ret = reg_node(pRExC_state, op);
-           FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
             if ((U8) *(RExC_parse + 1) == '{') {
                  /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
+               vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
             }
             goto finish_meta_pat;
-       case 'B':
+
+       case 'b':
             RExC_seen_zerolen++;
              RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = NBOUND + get_regex_charset(RExC_flags);
-            if (op > NBOUNDA) { /* /aa is same as /a */
-                op = NBOUNDA;
+           op = BOUND + get_regex_charset(RExC_flags);
+            if (op > BOUNDA) {  /* /aa is same as /a */
+                op = BOUNDA;
              }
-            else if (op == NBOUNDL) {
+            else if (op == BOUNDL) {
                  RExC_contains_locale = 1;
              }
             ret = reg_node(pRExC_state, op);
-           FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
             if ((U8) *(RExC_parse + 1) == '{') {
                  /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
+               vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
             }
             goto finish_meta_pat;
  
@@ -11887,7 +11894,7 @@ tryagain:
                                 FALSE, /* don't silence non-portable warnings.
                                           It would be a bug if these returned
                                           non-portables */
-                               RExC_strict,
+                               (bool) RExC_strict,
                                 NULL);
                  /* regclass() can only return RESTART_UTF8 if multi-char folds
                     are allowed.  */
@@ -12262,7 +12269,7 @@ tryagain:
                                                        &result,
                                                        &error_msg,
                                                        PASS2, /* out warnings */
-                                                       RExC_strict,
+                                                       (bool) RExC_strict,
                                                         TRUE, /* Output warnings
                                                                  for non-
                                                                  portables */
@@ -12291,7 +12298,7 @@ tryagain:
                                                        &result,
                                                        &error_msg,
                                                        PASS2, /* out warnings */
-                                                       RExC_strict,
+                                                       (bool) RExC_strict,
                                                         TRUE, /* Silence warnings
                                                                  for non-
                                                                  portables */
@@ -13769,6 +13776,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                 separate for a while from the non-complemented
                                 versions because of complications with /d
                                 matching */
+    SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+                                  treated more simply than the general case,
+                                  leading to less compilation and execution
+                                  work */
      UV element_count = 0;   /* Number of distinct elements in the class.
                                Optimizations may be possible if this is tiny */
      AV * multi_char_matches = NULL; /* Code points that fold to more than one
@@ -13806,16 +13817,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * runtime locale is UTF-8 */
      SV* only_utf8_locale_list = NULL;
  
-#ifdef EBCDIC
-    /* In a range, counts how many 0-2 of the ends of it came from literals,
-     * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
-    UV literal_endpoint = 0;
+    /* In a range, if one of the endpoints is non-character-set portable,
+     * meaning that it hard-codes a code point that may mean a different
+     * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
+     * mnemonic '\t' which each mean the same character no matter which
+     * character set the platform is on. */
+    unsigned int non_portable_endpoint = 0;
  
      /* Is the range unicode? which means on a platform that isn't 1-1 native
       * to Unicode (i.e. non-ASCII), each code point in it should be considered
       * to be a Unicode value.  */
      bool unicode_range = FALSE;
-#endif
      bool invert = FALSE;    /* Is this class to be complemented */
  
      bool warn_super = ALWAYS_WARN_SUPER;
@@ -13919,9 +13931,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         if (!range) {
             rangebegin = RExC_parse;
             element_count++;
-#ifdef EBCDIC
-            literal_endpoint = 0;
-#endif
+            non_portable_endpoint = 0;
         }
         if (UTF) {
             value = utf8n_to_uvchr((U8*)RExC_parse,
@@ -13938,12 +13948,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          {
              namedclass = regpposixcc(pRExC_state, value, strict);
          }
-        else if (value != '\\') {
-#ifdef EBCDIC
-            literal_endpoint++;
-#endif
-        }
-        else {
+        else if (value == '\\') {
              /* Is a backslash; get the code point of the char after it */
             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
                 value = utf8n_to_uvchr((U8*)RExC_parse,
@@ -14025,13 +14030,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                          prevvalue = save_prevvalue;
                          continue;   /* Back to top of loop to get next char */
                      }
+
                      /* Here, is a single code point, and <value> contains it */
-#ifdef EBCDIC
-                    /* We consider named characters to be literal characters,
-                     * and they are Unicode */
-                    literal_endpoint++;
-                    unicode_range = TRUE;
-#endif
+                    unicode_range = TRUE;   /* \N{} are Unicode */
                  }
                  break;
             case 'p':
@@ -14229,6 +14230,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         vFAIL(error_msg);
                     }
                 }
+                non_portable_endpoint++;
                 if (IN_ENCODING && value < 0x100) {
                     goto recode_encoding;
                 }
@@ -14248,11 +14250,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         vFAIL(error_msg);
                     }
                 }
+                non_portable_endpoint++;
                 if (IN_ENCODING && value < 0x100)
                     goto recode_encoding;
                 break;
             case 'c':
                 value = grok_bslash_c(*RExC_parse++, PASS2);
+                non_portable_endpoint++;
                 break;
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7':
@@ -14280,6 +14284,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                              (void)ReREFCNT_inc(RExC_rx_sv);
                          }
                      }
+                    non_portable_endpoint++;
                     if (IN_ENCODING && value < 0x100)
                         goto recode_encoding;
                     break;
@@ -14458,15 +14463,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                  &cp_list);
                      }
                  }
-                else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
+                else if (UNI_SEMANTICS
+                        || classnum == _CC_ASCII
+                        || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+                                                  || classnum == _CC_XDIGIT)))
+                {
+                    /* We usually have to worry about /d and /a affecting what
+                     * POSIX classes match, with special code needed for /d
+                     * because we won't know until runtime what all matches.
+                     * But there is no extra work needed under /u, and
+                     * [:ascii:] is unaffected by /a and /d; and :digit: and
+                     * :xdigit: don't have runtime differences under /d.  So we
+                     * can special case these, and avoid some extra work below,
+                     * and at runtime. */
+                    _invlist_union_maybe_complement_2nd(
+                                                     simple_posixes,
+                                                     PL_XPosix_ptrs[classnum],
+                                                     namedclass % 2 != 0,
+                                                     &simple_posixes);
+                }
+                else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
                             complement and use nposixes */
                      SV** posixes_ptr = namedclass % 2 == 0
                                         ? &posixes
                                         : &nposixes;
-                    SV** source_ptr = &PL_XPosix_ptrs[classnum];
                      _invlist_union_maybe_complement_2nd(
                                                       *posixes_ptr,
-                                                     *source_ptr,
+                                                     PL_XPosix_ptrs[classnum],
                                                       namedclass % 2 != 0,
                                                       posixes_ptr);
                  }
@@ -14490,8 +14513,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         if (range) {
  #ifdef EBCDIC
              /* For unicode ranges, we have to test that the Unicode as opposed
-             * to the native values are not decreasing.  (Above 255, and there
-             * is no difference between native and Unicode) */
+             * to the native values are not decreasing.  (Above 255, there is
+             * no difference between native and Unicode) */
             if (unicode_range && prevvalue < 255 && value < 255) {
                  if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
                      goto backwards_range;
@@ -14633,8 +14656,100 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              }
          }
  
+        if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+            if (range) {
+
+                /* If the range starts above 255, everything is portable and
+                 * likely to be so for any forseeable character set, so don't
+                 * warn. */
+                if (unicode_range && non_portable_endpoint && prevvalue < 256) {
+                    vWARN(RExC_parse, "Both or neither range ends should be Unicode");
+                }
+                else if (prevvalue != value) {
+
+                    /* Under strict, ranges that stop and/or end in an ASCII
+                     * printable should have each end point be a portable value
+                     * for it (preferably like 'A', but we don't warn if it is
+                     * a (portable) Unicode name or code point), and the range
+                     * must be be all digits or all letters of the same case.
+                     * Otherwise, the range is non-portable and unclear as to
+                     * what it contains */
+                    if ((isPRINT_A(prevvalue) || isPRINT_A(value))
+                        && (non_portable_endpoint
+                            || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
+                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
+                                   || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
+                    {
+                        vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+                    }
+                    else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+
+                        /* But the nature of Unicode and languages mean we
+                         * can't do the same checks for above-ASCII ranges,
+                         * except in the case of digit ones.  These should
+                         * contain only digits from the same group of 10.  The
+                         * ASCII case is handled just above.  0x660 is the
+                         * first digit character beyond ASCII.  Hence here, the
+                         * range could be a range of digits.  Find out.  */
+                        IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                         prevvalue);
+                        IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                         value);
+
+                        /* If the range start and final points are in the same
+                         * inversion list element, it means that either both
+                         * are not digits, or both are digits in a consecutive
+                         * sequence of digits.  (So far, Unicode has kept all
+                         * such sequences as distinct groups of 10, but assert
+                         * to make sure).  If the end points are not in the
+                         * same element, neither should be a digit. */
+                        if (index_start == index_final) {
+                            assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+                            || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+                            - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+                            == 10);
+                        }
+                        else if ((index_start >= 0
+                                  && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
+                                 || (index_final >= 0
+                                     && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+                        {
+                            vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
+                        }
+                    }
+                }
+            }
+            if ((! range || prevvalue == value) && non_portable_endpoint) {
+                if (isPRINT_A(value)) {
+                    char literal[3];
+                    unsigned d = 0;
+                    if (isBACKSLASHED_PUNCT(value)) {
+                        literal[d++] = '\\';
+                    }
+                    literal[d++] = (char) value;
+                    literal[d++] = '\0';
+
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
+                           (int) (RExC_parse - rangebegin),
+                           rangebegin,
+                           literal
+                        );
+                }
+                else if isMNEMONIC_CNTRL(value) {
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
+                           (int) (RExC_parse - rangebegin),
+                           rangebegin,
+                           cntrl_to_mnemonic((char) value)
+                        );
+                }
+            }
+        }
+
          /* Deal with this element of the class */
         if (! SIZE_ONLY) {
+
  #ifndef EBCDIC
              cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
                                                       prevvalue, value);
@@ -14645,7 +14760,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              if ((UNLIKELY(prevvalue == 0) && value >= 255)
                  || ! (prevvalue < 256
                        && (unicode_range
-                          || (literal_endpoint == 2
+                          || (! non_portable_endpoint
                                && ((isLOWER_A(prevvalue) && isLOWER_A(value))
                                    || (isUPPER_A(prevvalue)
                                        && isUPPER_A(value)))))))
@@ -14873,24 +14988,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      op = POSIXA;
                  }
              }
-            else if (prevvalue == 'A') {
-                if (value == 'Z'
+            else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+                /* We can optimize A-Z or a-z, but not if they could match
+                 * something like the KELVIN SIGN under /i (/a means they
+                 * can't) */
+                if (prevvalue == 'A') {
+                    if (value == 'Z'
  #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && ! non_portable_end_point
  #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+                        op = POSIXA;
+                    }
                  }
-            }
-            else if (prevvalue == 'a') {
-                if (value == 'z'
+                else if (prevvalue == 'a') {
+                    if (value == 'z'
  #ifdef EBCDIC
-                    && literal_endpoint == 2
+                        && ! non_portable_end_point
  #endif
-                ) {
-                    arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
-                    op = POSIXA;
+                    ) {
+                        arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+                        op = POSIXA;
+                    }
                  }
              }
          }
@@ -14944,6 +15064,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
              SvREFCNT_dec(posixes);
              SvREFCNT_dec(nposixes);
+            SvREFCNT_dec(simple_posixes);
              SvREFCNT_dec(cp_list);
              SvREFCNT_dec(cp_foldable_list);
              return ret;
@@ -15101,6 +15222,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * classes.  The lists are kept separate up to now because we don't want to
       * fold the classes (folding of those is automatically handled by the swash
       * fetching code) */
+    if (simple_posixes) {
+        _invlist_union(cp_list, simple_posixes, &cp_list);
+        SvREFCNT_dec_NN(simple_posixes);
+    }
      if (posixes || nposixes) {
          if (posixes && AT_LEAST_ASCII_RESTRICTED) {
              /* Under /a and /aa, nothing above ASCII matches these */
@@ -16485,10 +16610,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                  sv_catpvs(sv, "{non-utf8-latin1-all}");
              }
  
+            if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
+                sv_catpvs(sv, "{above_bitmap_all}");
+
              if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
                  SV *lv; /* Set if there is something outside the bit map. */
-                bool byte_output = FALSE;   /* If something in the bitmap has
-                                               been output */
+                bool byte_output = FALSE;   /* If something has been output */
                  SV *only_utf8_locale;
  
                  /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
@@ -16569,9 +16696,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      invlist_iterfinish(only_utf8_locale);
                  }
              }
-
-            if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
-                sv_catpvs(sv, "{above_bitmap_all}");
         }
          SvREFCNT_dec(bitmap_invlist);
  
@@ -17172,10 +17296,6 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
  }
  
  #ifdef DEBUGGING
-/* Certain characters are output as a sequence with the first being a
- * backslash. */
-#define isBACKSLASHED_PUNCT(c)                                              \
-                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
  
  STATIC void
  S_put_code_point(pTHX_ SV *sv, UV c)