This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Optimize /[a-z]/ and /[A-Z]/
[perl5.git] / regcomp.c
index 94dfd55..236cb24 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2007,6 +2007,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
     });
 
     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+    assert(re_trie_maxbuff);
     if (!SvIOK(re_trie_maxbuff)) {
         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
     }
@@ -4389,7 +4390,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (flags & SCF_DO_SUBSTR)
                    data->pos_min++;
                min++;
-               /* Fall through. */
+               /* FALLTHROUGH */
            case STAR:
                if (flags & SCF_DO_STCLASS) {
                    mincount = 0;
@@ -4861,7 +4862,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 
                case NPOSIXL:
                     invert = 1;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
 
                case POSIXL:
                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
@@ -4902,7 +4903,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                 case NPOSIXA:   /* For these, we always know the exact set of
                                    what's matched */
                     invert = 1;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
                case POSIXA:
                     if (FLAGS(scan) == _CC_ASCII) {
                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
@@ -4917,7 +4918,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                case NPOSIXD:
                case NPOSIXU:
                     invert = 1;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
                case POSIXD:
                case POSIXU:
                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
@@ -9411,8 +9412,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             int internal_argval = 0; /* internal_argval is only useful if
                                         !argok */
 
-            if (has_intervening_patws && SIZE_ONLY) {
-                ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+            if (has_intervening_patws) {
+                RExC_parse++;
+                vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
             }
            while ( *RExC_parse && *RExC_parse != ')' ) {
                if ( *RExC_parse == ':' ) {
@@ -9518,8 +9520,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         else if (*RExC_parse == '?') { /* (?...) */
            bool is_logical = 0;
            const char * const seqstart = RExC_parse;
-            if (has_intervening_patws && SIZE_ONLY) {
-                ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+            if (has_intervening_patws) {
+                RExC_parse++;
+                vFAIL("In '(?...)', the '(' and '?' must be adjacent");
             }
 
            RExC_parse++;
@@ -9734,7 +9737,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     RExC_parse--; /* rewind to let it be handled later */
                     goto parse_flags;
                 }
-                /*FALLTHROUGH */
+                /* FALLTHROUGH */
             case '1': case '2': case '3': case '4': /* (?1) */
            case '5': case '6': case '7': case '8': case '9':
                RExC_parse--;
@@ -9806,7 +9809,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                }
                *flagp |= POSTPONED;
                paren = *RExC_parse++;
-               /* FALL THROUGH */
+               /* FALLTHROUGH */
            case '{':           /* (?{...}) */
            {
                U32 n = 0;
@@ -10151,7 +10154,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
        case '=':
        case '!':
            *flagp &= ~HASWIDTH;
-           /* FALL THROUGH */
+           /* FALLTHROUGH */
        case '>':
            ender = reg_node(pRExC_state, SUCCEED);
            break;
@@ -10673,7 +10676,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
      * modifier.  The other meaning does not, so use a temporary until we find
      * out which we are being called with */
     p = (RExC_flags & RXf_PMf_EXTENDED)
-       ? regwhite( pRExC_state, RExC_parse )
+       ? regpatws(pRExC_state, RExC_parse,
+                                TRUE) /* means recognize comments */
        : RExC_parse;
 
     /* Disambiguate between \N meaning a named character versus \N meaning
@@ -11264,7 +11268,7 @@ tryagain:
            RExC_parse++;
            goto defchar;
        }
-       /* FALL THROUGH */
+       /* FALLTHROUGH */
     case '?':
     case '+':
     case '*':
@@ -11427,7 +11431,7 @@ tryagain:
             }
 
            *flagp |= HASWIDTH|SIMPLE;
-            /* FALL THROUGH */
+            /* FALLTHROUGH */
 
          finish_meta_pat:
            nextchar(pRExC_state);
@@ -11626,7 +11630,7 @@ tryagain:
        case '\0':
            if (RExC_parse >= RExC_end)
                FAIL("Trailing \\");
-           /* FALL THROUGH */
+           /* FALLTHROUGH */
        default:
            /* Do not generate "unrecognized" warnings here, we fall
               back into the quick-grab loop below */
@@ -11637,10 +11641,11 @@ tryagain:
 
     case '#':
        if (RExC_flags & RXf_PMf_EXTENDED) {
-           if ( reg_skipcomment( pRExC_state ) )
+           RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+           if (RExC_parse < RExC_end)
                goto tryagain;
        }
-       /* FALL THROUGH */
+       /* FALLTHROUGH */
 
     default:
 
@@ -11718,7 +11723,8 @@ tryagain:
                oldp = p;
 
                if (RExC_flags & RXf_PMf_EXTENDED)
-                   p = regwhite( pRExC_state, p );
+                    p = regpatws(pRExC_state, p,
+                                          TRUE); /* means recognize comments */
                switch ((U8)*p) {
                case '^':
                case '$':
@@ -11932,7 +11938,7 @@ tryagain:
                    case '\0':
                        if (p >= RExC_end)
                            FAIL("Trailing \\");
-                       /* FALL THROUGH */
+                       /* FALLTHROUGH */
                    default:
                        if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
                            /* Include any { following the alpha to emphasize
@@ -11946,15 +11952,6 @@ tryagain:
                    break;
                default:    /* A literal character */
 
-                    if (! SIZE_ONLY
-                        && RExC_flags & RXf_PMf_EXTENDED
-                        && ckWARN_d(WARN_DEPRECATED)
-                        && is_PATWS_non_low_safe(p, RExC_end, UTF))
-                    {
-                        vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
-                                "Escape literal pattern white space under /x");
-                    }
-
                  normal_default:
                    if (UTF8_IS_START(*p) && UTF) {
                        STRLEN numlen;
@@ -11972,7 +11969,8 @@ tryagain:
                 */
 
                if ( RExC_flags & RXf_PMf_EXTENDED)
-                   p = regwhite( pRExC_state, p );
+                    p = regpatws(pRExC_state, p,
+                                          TRUE); /* means recognize comments */
 
                 /* If the next thing is a quantifier, it applies to this
                  * character only, which means that this character has to be in
@@ -12327,39 +12325,11 @@ tryagain:
 }
 
 STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
-    const char *e = RExC_end;
-
-    PERL_ARGS_ASSERT_REGWHITE;
-
-    while (p < e) {
-       if (isSPACE(*p))
-           ++p;
-       else if (*p == '#') {
-            bool ended = 0;
-           do {
-               if (*p++ == '\n') {
-                   ended = 1;
-                   break;
-               }
-           } while (p < e);
-           if (!ended)
-                RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
-       }
-       else
-           break;
-    }
-    return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
 {
     /* Returns the next non-pattern-white space, non-comment character (the
      * latter only if 'recognize_comment is true) in the string p, which is
-     * ended by RExC_end.  If there is no line break ending a comment,
-     * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+     * ended by RExC_end.  See also reg_skipcomment */
     const char *e = RExC_end;
 
     PERL_ARGS_ASSERT_REGPATWS;
@@ -12370,16 +12340,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
            p += len;
         }
        else if (recognize_comment && *p == '#') {
-            bool ended = 0;
-           do {
-                p++;
-                if (is_LNBREAK_safe(p, e, UTF)) {
-                   ended = 1;
-                   break;
-               }
-           } while (p < e);
-           if (!ended)
-                RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+            p = reg_skipcomment(pRExC_state, p);
        }
        else
            break;
@@ -12707,11 +12668,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         while (RExC_parse < RExC_end) {
             SV* current = NULL;
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                TRUE); /* means recognize comments */
+                                          TRUE); /* means recognize comments */
             switch (*RExC_parse) {
                 case '?':
                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
-                    /* FALL THROUGH */
+                    /* FALLTHROUGH */
                 default:
                     break;
                 case '\\':
@@ -12824,7 +12785,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
 
         /* Skip white space */
         RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                TRUE); /* means recognize comments */
+                                         TRUE /* means recognize comments */ );
         if (RExC_parse >= RExC_end) {
             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
         }
@@ -12898,7 +12859,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                     RExC_flags = save_flags;
                     goto handle_operand;
                 }
-                /* FALL THROUGH */
+                /* FALLTHROUGH */
 
             default:
                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
@@ -12994,7 +12955,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                 top_index -= 2;
                 SvREFCNT_dec_NN(lparen);
 
-                /* FALL THROUGH */
+                /* FALLTHROUGH */
             }
 
               handle_operand:
@@ -13296,7 +13257,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     if (skip_white) {
         RExC_parse = regpatws(pRExC_state, RExC_parse,
-                              FALSE /* means don't recognize comments */);
+                              FALSE /* means don't recognize comments */ );
     }
 
     if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
@@ -13306,7 +13267,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         RExC_naughty++;
         if (skip_white) {
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */);
+                                  FALSE /* means don't recognize comments */ );
         }
     }
 
@@ -13344,7 +13305,7 @@ parseit:
 
         if (skip_white) {
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */);
+                                  FALSE /* means don't recognize comments */ );
         }
 
         if  (UCHARAT(RExC_parse) == ']') {
@@ -13855,7 +13816,7 @@ parseit:
 
         if (skip_white) {
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                FALSE /* means don't recognize comments */);
+                                FALSE /* means don't recognize comments */ );
         }
 
        if (range) {
@@ -14240,6 +14201,26 @@ parseit:
                     op = POSIXA;
                 }
             }
+            else if (prevvalue == 'A') {
+                if (value == 'Z'
+#ifdef EBCDIC
+                    && literal_endpoint == 2
+#endif
+                ) {
+                    arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+                    op = POSIXA;
+                }
+            }
+            else if (prevvalue == 'a') {
+                if (value == 'z'
+#ifdef EBCDIC
+                    && literal_endpoint == 2
+#endif
+                ) {
+                    arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+                    op = POSIXA;
+                }
+            }
         }
 
         /* Here, we have changed <op> away from its initial value iff we found
@@ -14923,6 +14904,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
        av_store(av, 0, (runtime_defns)
                        ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
        if (swash) {
+           assert(cp_list);
            av_store(av, 1, swash);
            SvREFCNT_dec_NN(cp_list);
        }
@@ -14951,35 +14933,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
 
 /* reg_skipcomment()
 
-   Absorbs an /x style # comments from the input stream.
-   Returns true if there is more text remaining in the stream.
-   Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
-   terminates the pattern without including a newline.
+   Absorbs an /x style # comment from the input stream,
+   returning a pointer to the first character beyond the comment, or if the
+   comment terminates the pattern without anything following it, this returns
+   one past the final character of the pattern (in other words, RExC_end) and
+   sets the REG_RUN_ON_COMMENT_SEEN flag.
 
-   Note its the callers responsibility to ensure that we are
+   Note it's the callers responsibility to ensure that we are
    actually in /x mode
 
 */
 
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
 {
-    bool ended = 0;
-
     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
 
-    while (RExC_parse < RExC_end)
-        if (*RExC_parse++ == '\n') {
-            ended = 1;
-            break;
+    assert(*p = '#');
+
+    while (p < RExC_end) {
+        if (*(++p) == '\n') {
+            return p+1;
         }
-    if (!ended) {
-        /* we ran off the end of the pattern without ending
-           the comment, so we have to add an \n when wrapping */
-        RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
-        return 0;
-    } else
-        return 1;
+    }
+
+    /* we ran off the end of the pattern without ending the comment, so we have
+     * to add an \n when wrapping */
+    RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+    return p;
 }
 
 /* nextchar()
@@ -15017,16 +14998,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
            continue;
        }
        if (RExC_flags & RXf_PMf_EXTENDED) {
-           if (isSPACE(*RExC_parse)) {
-               RExC_parse++;
-               continue;
-           }
-           else if (*RExC_parse == '#') {
-               if ( reg_skipcomment( pRExC_state ) )
-                   continue;
-           }
+            char * p = regpatws(pRExC_state, RExC_parse,
+                                          TRUE); /* means recognize comments */
+            if (p != RExC_parse) {
+                RExC_parse = p;
+                continue;
+            }
        }
-       return retval;
+        return retval;
     }
 }
 
@@ -16346,12 +16325,12 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
                 * when the corresponding reg_ac_data struct is freed.
                 */
                reti->regstclass= ri->regstclass;
-               /* Fall through */
+               /* FALLTHROUGH */
            case 't':
                OP_REFCNT_LOCK;
                ((reg_trie_data*)ri->data->data[i])->refcount++;
                OP_REFCNT_UNLOCK;
-               /* Fall through */
+               /* FALLTHROUGH */
            case 'l':
            case 'L':
                d->data[i] = ri->data->data[i];
@@ -16612,6 +16591,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
         last= plast;
 
     while (PL_regkind[op] != END && (!last || node < last)) {
+        assert(node);
        /* While that wasn't END last time... */
        NODE_ALIGN(node);
        op = OP(node);