Optimize /[a-z]/ and /[A-Z]/

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 94dfd55..236cb24 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2007,6 +2007,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
      });
  
      re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+    assert(re_trie_maxbuff);
      if (!SvIOK(re_trie_maxbuff)) {
          sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
      }
@@ -4389,7 +4390,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (flags & SCF_DO_SUBSTR)
                     data->pos_min++;
                 min++;
-               /* Fall through. */
+               /* FALLTHROUGH */
             case STAR:
                 if (flags & SCF_DO_STCLASS) {
                     mincount = 0;
@@ -4861,7 +4862,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
  
                 case NPOSIXL:
                      invert = 1;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
  
                 case POSIXL:
                      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
@@ -4902,7 +4903,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                  case NPOSIXA:   /* For these, we always know the exact set of
                                     what's matched */
                      invert = 1;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
                 case POSIXA:
                      if (FLAGS(scan) == _CC_ASCII) {
                          my_invlist = PL_XPosix_ptrs[_CC_ASCII];
@@ -4917,7 +4918,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                 case NPOSIXD:
                 case NPOSIXU:
                      invert = 1;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
                 case POSIXD:
                 case POSIXU:
                      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
@@ -9411,8 +9412,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
              int internal_argval = 0; /* internal_argval is only useful if
                                          !argok */
  
-            if (has_intervening_patws && SIZE_ONLY) {
-                ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+            if (has_intervening_patws) {
+                RExC_parse++;
+                vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
              }
             while ( *RExC_parse && *RExC_parse != ')' ) {
                 if ( *RExC_parse == ':' ) {
@@ -9518,8 +9520,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
          else if (*RExC_parse == '?') { /* (?...) */
             bool is_logical = 0;
             const char * const seqstart = RExC_parse;
-            if (has_intervening_patws && SIZE_ONLY) {
-                ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+            if (has_intervening_patws) {
+                RExC_parse++;
+                vFAIL("In '(?...)', the '(' and '?' must be adjacent");
              }
  
             RExC_parse++;
@@ -9734,7 +9737,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      RExC_parse--; /* rewind to let it be handled later */
                      goto parse_flags;
                  }
-                /*FALLTHROUGH */
+                /* FALLTHROUGH */
              case '1': case '2': case '3': case '4': /* (?1) */
             case '5': case '6': case '7': case '8': case '9':
                 RExC_parse--;
@@ -9806,7 +9809,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 }
                 *flagp |= POSTPONED;
                 paren = *RExC_parse++;
-               /* FALL THROUGH */
+               /* FALLTHROUGH */
             case '{':           /* (?{...}) */
             {
                 U32 n = 0;
@@ -10151,7 +10154,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         case '=':
         case '!':
             *flagp &= ~HASWIDTH;
-           /* FALL THROUGH */
+           /* FALLTHROUGH */
         case '>':
             ender = reg_node(pRExC_state, SUCCEED);
             break;
@@ -10673,7 +10676,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
       * modifier.  The other meaning does not, so use a temporary until we find
       * out which we are being called with */
      p = (RExC_flags & RXf_PMf_EXTENDED)
-       ? regwhite( pRExC_state, RExC_parse )
+       ? regpatws(pRExC_state, RExC_parse,
+                                TRUE) /* means recognize comments */
         : RExC_parse;
  
      /* Disambiguate between \N meaning a named character versus \N meaning
@@ -11264,7 +11268,7 @@ tryagain:
             RExC_parse++;
             goto defchar;
         }
-       /* FALL THROUGH */
+       /* FALLTHROUGH */
      case '?':
      case '+':
      case '*':
@@ -11427,7 +11431,7 @@ tryagain:
              }
  
             *flagp |= HASWIDTH|SIMPLE;
-            /* FALL THROUGH */
+            /* FALLTHROUGH */
  
           finish_meta_pat:
             nextchar(pRExC_state);
@@ -11626,7 +11630,7 @@ tryagain:
         case '\0':
             if (RExC_parse >= RExC_end)
                 FAIL("Trailing \\");
-           /* FALL THROUGH */
+           /* FALLTHROUGH */
         default:
             /* Do not generate "unrecognized" warnings here, we fall
                back into the quick-grab loop below */
@@ -11637,10 +11641,11 @@ tryagain:
  
      case '#':
         if (RExC_flags & RXf_PMf_EXTENDED) {
-           if ( reg_skipcomment( pRExC_state ) )
+           RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+           if (RExC_parse < RExC_end)
                 goto tryagain;
         }
-       /* FALL THROUGH */
+       /* FALLTHROUGH */
  
      default:
  
@@ -11718,7 +11723,8 @@ tryagain:
                 oldp = p;
  
                 if (RExC_flags & RXf_PMf_EXTENDED)
-                   p = regwhite( pRExC_state, p );
+                    p = regpatws(pRExC_state, p,
+                                          TRUE); /* means recognize comments */
                 switch ((U8)*p) {
                 case '^':
                 case '$':
@@ -11932,7 +11938,7 @@ tryagain:
                     case '\0':
                         if (p >= RExC_end)
                             FAIL("Trailing \\");
-                       /* FALL THROUGH */
+                       /* FALLTHROUGH */
                     default:
                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
                             /* Include any { following the alpha to emphasize
@@ -11946,15 +11952,6 @@ tryagain:
                     break;
                 default:    /* A literal character */
  
-                    if (! SIZE_ONLY
-                        && RExC_flags & RXf_PMf_EXTENDED
-                        && ckWARN_d(WARN_DEPRECATED)
-                        && is_PATWS_non_low_safe(p, RExC_end, UTF))
-                    {
-                        vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
-                                "Escape literal pattern white space under /x");
-                    }
-
                   normal_default:
                     if (UTF8_IS_START(*p) && UTF) {
                         STRLEN numlen;
@@ -11972,7 +11969,8 @@ tryagain:
                  */
  
                 if ( RExC_flags & RXf_PMf_EXTENDED)
-                   p = regwhite( pRExC_state, p );
+                    p = regpatws(pRExC_state, p,
+                                          TRUE); /* means recognize comments */
  
                  /* If the next thing is a quantifier, it applies to this
                   * character only, which means that this character has to be in
@@ -12327,39 +12325,11 @@ tryagain:
  }
  
  STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
-    const char *e = RExC_end;
-
-    PERL_ARGS_ASSERT_REGWHITE;
-
-    while (p < e) {
-       if (isSPACE(*p))
-           ++p;
-       else if (*p == '#') {
-            bool ended = 0;
-           do {
-               if (*p++ == '\n') {
-                   ended = 1;
-                   break;
-               }
-           } while (p < e);
-           if (!ended)
-                RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
-       }
-       else
-           break;
-    }
-    return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
  {
      /* Returns the next non-pattern-white space, non-comment character (the
       * latter only if 'recognize_comment is true) in the string p, which is
-     * ended by RExC_end.  If there is no line break ending a comment,
-     * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+     * ended by RExC_end.  See also reg_skipcomment */
      const char *e = RExC_end;
  
      PERL_ARGS_ASSERT_REGPATWS;
@@ -12370,16 +12340,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
             p += len;
          }
         else if (recognize_comment && *p == '#') {
-            bool ended = 0;
-           do {
-                p++;
-                if (is_LNBREAK_safe(p, e, UTF)) {
-                   ended = 1;
-                   break;
-               }
-           } while (p < e);
-           if (!ended)
-                RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+            p = reg_skipcomment(pRExC_state, p);
         }
         else
             break;
@@ -12707,11 +12668,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
          while (RExC_parse < RExC_end) {
              SV* current = NULL;
              RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                TRUE); /* means recognize comments */
+                                          TRUE); /* means recognize comments */
              switch (*RExC_parse) {
                  case '?':
                      if (RExC_parse[1] == '[') depth++, RExC_parse++;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
                  default:
                      break;
                  case '\\':
@@ -12824,7 +12785,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
  
          /* Skip white space */
          RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                TRUE); /* means recognize comments */
+                                         TRUE /* means recognize comments */ );
          if (RExC_parse >= RExC_end) {
              Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
          }
@@ -12898,7 +12859,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                      RExC_flags = save_flags;
                      goto handle_operand;
                  }
-                /* FALL THROUGH */
+                /* FALLTHROUGH */
  
              default:
                  RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
@@ -12994,7 +12955,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                  top_index -= 2;
                  SvREFCNT_dec_NN(lparen);
  
-                /* FALL THROUGH */
+                /* FALLTHROUGH */
              }
  
                handle_operand:
@@ -13296,7 +13257,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      if (skip_white) {
          RExC_parse = regpatws(pRExC_state, RExC_parse,
-                              FALSE /* means don't recognize comments */);
+                              FALSE /* means don't recognize comments */ );
      }
  
      if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
@@ -13306,7 +13267,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          RExC_naughty++;
          if (skip_white) {
              RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */);
+                                  FALSE /* means don't recognize comments */ );
          }
      }
  
@@ -13344,7 +13305,7 @@ parseit:
  
          if (skip_white) {
              RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */);
+                                  FALSE /* means don't recognize comments */ );
          }
  
          if  (UCHARAT(RExC_parse) == ']') {
@@ -13855,7 +13816,7 @@ parseit:
  
          if (skip_white) {
              RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                FALSE /* means don't recognize comments */);
+                                FALSE /* means don't recognize comments */ );
          }
  
         if (range) {
@@ -14240,6 +14201,26 @@ parseit:
                      op = POSIXA;
                  }
              }
+            else if (prevvalue == 'A') {
+                if (value == 'Z'
+#ifdef EBCDIC
+                    && literal_endpoint == 2
+#endif
+                ) {
+                    arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+                    op = POSIXA;
+                }
+            }
+            else if (prevvalue == 'a') {
+                if (value == 'z'
+#ifdef EBCDIC
+                    && literal_endpoint == 2
+#endif
+                ) {
+                    arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+                    op = POSIXA;
+                }
+            }
          }
  
          /* Here, we have changed <op> away from its initial value iff we found
@@ -14923,6 +14904,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
         av_store(av, 0, (runtime_defns)
                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
         if (swash) {
+           assert(cp_list);
             av_store(av, 1, swash);
             SvREFCNT_dec_NN(cp_list);
         }
@@ -14951,35 +14933,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
  
  /* reg_skipcomment()
  
-   Absorbs an /x style # comments from the input stream.
-   Returns true if there is more text remaining in the stream.
-   Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
-   terminates the pattern without including a newline.
+   Absorbs an /x style # comment from the input stream,
+   returning a pointer to the first character beyond the comment, or if the
+   comment terminates the pattern without anything following it, this returns
+   one past the final character of the pattern (in other words, RExC_end) and
+   sets the REG_RUN_ON_COMMENT_SEEN flag.
  
-   Note its the callers responsibility to ensure that we are
+   Note it's the caller's responsibility to ensure that we are
     actually in /x mode
  
  */
  
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
  {
-    bool ended = 0;
-
      PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
  
-    while (RExC_parse < RExC_end)
-        if (*RExC_parse++ == '\n') {
-            ended = 1;
-            break;
+    assert(*p == '#'); /* compare, don't assign: p must be at the '#' */
+
+    while (p < RExC_end) {
+        if (*(++p) == '\n') {
+            return p+1;
          }
-    if (!ended) {
-        /* we ran off the end of the pattern without ending
-           the comment, so we have to add an \n when wrapping */
-        RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
-        return 0;
-    } else
-        return 1;
+    }
+
+    /* we ran off the end of the pattern without ending the comment, so we have
+     * to add an \n when wrapping */
+    RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+    return p;
  }
  
  /* nextchar()
@@ -15017,16 +14998,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
             continue;
         }
         if (RExC_flags & RXf_PMf_EXTENDED) {
-           if (isSPACE(*RExC_parse)) {
-               RExC_parse++;
-               continue;
-           }
-           else if (*RExC_parse == '#') {
-               if ( reg_skipcomment( pRExC_state ) )
-                   continue;
-           }
+            char * p = regpatws(pRExC_state, RExC_parse,
+                                          TRUE); /* means recognize comments */
+            if (p != RExC_parse) {
+                RExC_parse = p;
+                continue;
+            }
         }
-       return retval;
+        return retval;
      }
  }
  
@@ -16346,12 +16325,12 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
                  * when the corresponding reg_ac_data struct is freed.
                  */
                 reti->regstclass= ri->regstclass;
-               /* Fall through */
+               /* FALLTHROUGH */
             case 't':
                 OP_REFCNT_LOCK;
                 ((reg_trie_data*)ri->data->data[i])->refcount++;
                 OP_REFCNT_UNLOCK;
-               /* Fall through */
+               /* FALLTHROUGH */
             case 'l':
             case 'L':
                 d->data[i] = ri->data->data[i];
@@ -16612,6 +16591,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
          last= plast;
  
      while (PL_regkind[op] != END && (!last || node < last)) {
+        assert(node);
         /* While that wasn't END last time... */
         NODE_ALIGN(node);
         op = OP(node);