This is a live mirror of the Perl 5 development repository currently hosted at https://github.com/perl/perl5
On VMS, put pods in [.lib.pods] rather than [.lib.pod]
[perl5.git] / regexec.c
index 8b305e8..13cc68b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -55,7 +55,6 @@
 #  define PERL_NO_GET_CONTEXT
 #endif
 
-/*SUPPRESS 112*/
 /*
  * pregcomp and pregexec -- regsub and regerror are not used in perl
  *
@@ -417,8 +416,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     char *check_at = Nullch;           /* check substr found at this pos */
     const I32 multiline = prog->reganch & PMf_MULTILINE;
 #ifdef DEBUGGING
-    char *i_strpos = strpos;
-    SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+    const char * const i_strpos = strpos;
+    SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
 #endif
 
     GET_RE_DEBUG_FLAGS_DECL;
@@ -519,6 +518,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                     || ((slen = SvCUR(check)) > 1
                         && memNE(SvPVX_const(check), s, slen)))
                goto report_neq;
+           check_at = s;
            goto success_at_start;
          }
        }
@@ -616,7 +616,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
          do_other_anchored:
            {
-               char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
+               char * const last = HOP3c(s, -start_shift, strbeg);
+               char *last1, *last2;
                char *s1 = s;
                SV* must;
 
@@ -862,7 +863,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
           regstclass does not come from lookahead...  */
        /* If regstclass takes bytelength more than 1: If charlength==1, OK.
           This leaves EXACTF only, which is dealt with in find_byclass().  */
-        const U8* str = (U8*)STRING(prog->regstclass);
+        const U8* const str = (U8*)STRING(prog->regstclass);
         const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
                    ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
                    : 1);
@@ -1039,14 +1040,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
                U8 *sm = (U8 *) m;
                U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
                U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
+               const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
 
                to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
                to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
                c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 
-                                   0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+                                   0, uniflags);
                c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
-                                   0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+                                   0, uniflags);
                lnc = 0;
                while (sm < ((U8 *) m + ln)) {
                    lnc++;
@@ -1085,14 +1087,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
                UV c, f;
                U8 tmpbuf [UTF8_MAXBYTES+1];
                STRLEN len, foldlen;
-               
+               const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
                if (c1 == c2) {
                    /* Upper and lower of 1st char are equal -
                     * probably not a "letter". */
                    while (s <= e) {
                        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
-                                          ckWARN(WARN_UTF8) ?
-                                          0 : UTF8_ALLOW_ANY);
+                                          uniflags);
                        if ( c == c1
                             && (ln == len ||
                                 ibcmp_utf8(s, (char **)0, 0,  do_utf8,
@@ -1119,8 +1120,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
                else {
                    while (s <= e) {
                      c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
-                                          ckWARN(WARN_UTF8) ?
-                                          0 : UTF8_ALLOW_ANY);
+                                          uniflags);
 
                        /* Handle some of the three Greek sigmas cases.
                         * Note that not all the possible combinations
@@ -1651,7 +1651,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
 
     GET_RE_DEBUG_FLAGS_DECL;
 
-    (void)data; /* Currently unused */
+    PERL_UNUSED_ARG(data);
     RX_MATCH_UTF8_set(prog,do_utf8);
 
     PL_regcc = 0;
@@ -1849,7 +1849,6 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                                   "Did not find anchored character...\n")
                );
     }
-    /*SUPPRESS 560*/
     else if (prog->anchored_substr != Nullsv
              || prog->anchored_utf8 != Nullsv
              || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
@@ -1949,8 +1948,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
        }
        DEBUG_EXECUTE_r({
            SV *prop = sv_newmortal();
-           char *s0;
-           char *s1;
+           const char *s0;
+           const char *s1;
            int len0;
            int len1;
 
@@ -1958,7 +1957,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
            s0 = UTF ?
              pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
                             UNI_DISPLAY_REGEX) :
-             SvPVX(prop);
+             SvPVX_const(prop);
            len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
            s1 = UTF ?
              sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
@@ -1994,7 +1993,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
            }
            else {
                STRLEN len;
-                const char * const little = SvPV(float_real, len);
+                const char * const little = SvPV_const(float_real, len);
 
                if (SvTAIL(float_real)) {
                    if (memEQ(strend - len + 1, little, len - 1))
@@ -2070,7 +2069,7 @@ got_it:
                                  (int) SvTYPE(sv));
                }
                prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
-               prog->subbeg = SvPVX(prog->saved_copy);
+               prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
                assert (SvPOKp(prog->saved_copy));
            } else
 #endif
@@ -2148,7 +2147,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos)
            SAVEDESTRUCTOR_X(restore_pos, 0);
         }
         if (!PL_reg_curpm) {
-           Newz(22, PL_reg_curpm, 1, PMOP);
+           Newxz(PL_reg_curpm, 1, PMOP);
 #ifdef USE_ITHREADS
             {
                 SV* repointer = newSViv(0);
@@ -2194,7 +2193,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos)
         if(PL_reg_start_tmp)
             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
         else
-            New(22, PL_reg_start_tmp, PL_reg_start_tmpl, char*);
+            Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
     }
 
     /* XXXX What this code is doing here?!!!  There should be no need
@@ -2409,7 +2408,7 @@ S_regmatch(pTHX_ regnode *prog)
 #if 0
     I32 firstcp = PL_savestack_ix;
 #endif
-    const register bool do_utf8 = PL_reg_match_utf8;
+    register const bool do_utf8 = PL_reg_match_utf8;
 #ifdef DEBUGGING
     SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
     SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
@@ -2417,6 +2416,7 @@ S_regmatch(pTHX_ regnode *prog)
 
     SV *re_debug_flags = NULL;
 #endif
+    U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
 
     GET_RE_DEBUG_FLAGS;
 
@@ -2584,8 +2584,6 @@ S_regmatch(pTHX_ regnode *prog)
        case TRIEF:
        case TRIEFL:
            {
-
-               const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
                U8 *uc = ( U8* )locinput;
                U32 state = 1;
                U16 charid = 0;
@@ -2614,7 +2612,7 @@ S_regmatch(pTHX_ regnode *prog)
 
                    if ( base ) {
 
-                       if ( do_utf8 || UTF ) {
+                       if ( do_utf8 ) {
                            if ( foldlen>0 ) {
                                uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
                                foldlen -= len;
@@ -2653,7 +2651,6 @@ S_regmatch(pTHX_ regnode *prog)
               from previous if blocks */
        case TRIE:
            {
-               const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
                U8 *uc = (U8*)locinput;
                U32 state = 1;
                U16 charid = 0;
@@ -2680,7 +2677,7 @@ S_regmatch(pTHX_ regnode *prog)
 
                    if ( base ) {
 
-                       if ( do_utf8 || UTF ) {
+                       if ( do_utf8 ) {
                            uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
                        } else {
                            uvc = (U32)*uc;
@@ -2727,7 +2724,7 @@ S_regmatch(pTHX_ regnode *prog)
                            "%*s  %sonly one match : #%d <%s>%s\n",
                            REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
                            accept_buff[ 0 ].wordnum,
-                           tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
+                           tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
                            PL_colors[5] );
                    });
                    PL_reginput = (char *)accept_buff[ 0 ].endpos;
@@ -2762,7 +2759,7 @@ S_regmatch(pTHX_ regnode *prog)
                            PerlIO_printf( Perl_debug_log, "%*s  %strying alternation #%d <%s> at 0x%p%s\n",
                                REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
                                accept_buff[best].wordnum,
-                               tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
+                               tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
                                PL_colors[5] );
                        });
                        if ( best<accepted ) {
@@ -2809,8 +2806,7 @@ S_regmatch(pTHX_ regnode *prog)
                             sayNO;
                        if (NATIVE_TO_UNI(*(U8*)s) !=
                            utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
-                                          ckWARN(WARN_UTF8) ?
-                                          0 : UTF8_ALLOW_ANY))
+                                           uniflags))
                             sayNO;
                        l += ulen;
                        s ++;
@@ -2824,8 +2820,7 @@ S_regmatch(pTHX_ regnode *prog)
                            sayNO;
                        if (NATIVE_TO_UNI(*((U8*)l)) !=
                            utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
-                                          ckWARN(WARN_UTF8) ?
-                                          0 : UTF8_ALLOW_ANY))
+                                          uniflags))
                            sayNO;
                        s += ulen;
                        l ++;
@@ -3556,7 +3551,7 @@ S_regmatch(pTHX_ regnode *prog)
                    }
                    else {
                        PL_reg_poscache_size = size;
-                       Newz(29, PL_reg_poscache, size, char);
+                       Newxz(PL_reg_poscache, size, char);
                    }
                    DEBUG_EXECUTE_r(
                        PerlIO_printf(Perl_debug_log,
@@ -3942,16 +3937,13 @@ S_regmatch(pTHX_ regnode *prog)
                             to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
                             c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
-                                                ckWARN(WARN_UTF8) ?
-                                                0 : UTF8_ALLOW_ANY);
+                                                uniflags);
                             c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
-                                                ckWARN(WARN_UTF8) ?
-                                                0 : UTF8_ALLOW_ANY);
+                                                uniflags);
                        }
                        else {
                            c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
-                                                    ckWARN(WARN_UTF8) ?
-                                                    0 : UTF8_ALLOW_ANY);
+                                                    uniflags);
                        }
                    }
                }
@@ -4012,8 +4004,7 @@ S_regmatch(pTHX_ regnode *prog)
                                while (locinput <= e &&
                                       utf8n_to_uvchr((U8*)locinput,
                                                      UTF8_MAXBYTES, &len,
-                                                     ckWARN(WARN_UTF8) ?
-                                                     0 : UTF8_ALLOW_ANY) != (UV)c1) {
+                                                     uniflags) != (UV)c1) {
                                    locinput += len;
                                    count++;
                                }
@@ -4024,8 +4015,7 @@ S_regmatch(pTHX_ regnode *prog)
                                while (locinput <= e) {
                                    UV c = utf8n_to_uvchr((U8*)locinput,
                                                          UTF8_MAXBYTES, &len,
-                                                         ckWARN(WARN_UTF8) ?
-                                                         0 : UTF8_ALLOW_ANY);
+                                                         uniflags);
                                    if (c == (UV)c1 || c == (UV)c2)
                                        break;
                                    locinput += len;
@@ -4061,8 +4051,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (do_utf8)
                            c = utf8n_to_uvchr((U8*)PL_reginput,
                                               UTF8_MAXBYTES, 0,
-                                              ckWARN(WARN_UTF8) ?
-                                              0 : UTF8_ALLOW_ANY);
+                                              uniflags);
                        else
                            c = UCHARAT(PL_reginput);
                        /* If it could work, try it. */
@@ -4111,8 +4100,7 @@ S_regmatch(pTHX_ regnode *prog)
                            if (do_utf8)
                                c = utf8n_to_uvchr((U8*)PL_reginput,
                                                   UTF8_MAXBYTES, 0,
-                                                  ckWARN(WARN_UTF8) ?
-                                                  0 : UTF8_ALLOW_ANY);
+                                                  uniflags);
                            else
                                c = UCHARAT(PL_reginput);
                        }
@@ -4134,8 +4122,7 @@ S_regmatch(pTHX_ regnode *prog)
                            if (do_utf8)
                                c = utf8n_to_uvchr((U8*)PL_reginput,
                                                   UTF8_MAXBYTES, 0,
-                                                  ckWARN(WARN_UTF8) ?
-                                                  0 : UTF8_ALLOW_ANY);
+                                                  uniflags);
                            else
                                c = UCHARAT(PL_reginput);
                        }
@@ -4328,7 +4315,6 @@ do_no:
                goto do_no;
            }
            /* Have more choice yet.  Reuse the same uwb.  */
-           /*SUPPRESS 560*/
            if ((n = (uwb->type == RE_UNWIND_BRANCH
                      ? NEXT_OFF(next) : ARG(next))))
                next += n;
@@ -4673,16 +4659,16 @@ Perl_regclass_swash(pTHX_ register const regnode* node, bool doinit, SV** listsv
        const U32 n = ARG(node);
 
        if (PL_regdata->what[n] == 's') {
-           SV *rv = (SV*)PL_regdata->data[n];
-           AV *av = (AV*)SvRV((SV*)rv);
-           SV **ary = AvARRAY(av);
+           SV * const rv = (SV*)PL_regdata->data[n];
+           AV * const av = (AV*)SvRV((SV*)rv);
+           SV **const ary = AvARRAY(av);
            SV **a, **b;
        
-           /* See the end of regcomp.c:S_reglass() for
+           /* See the end of regcomp.c:S_regclass() for
             * documentation of these array elements. */
 
            si = *ary;
-           a  = SvTYPE(ary[1]) == SVt_RV   ? &ary[1] : 0;
+           a  = SvROK(ary[1]) ? &ary[1] : 0;
            b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
 
            if (a)
@@ -4724,9 +4710,13 @@ S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp
     STRLEN len = 0;
     STRLEN plen;
 
-    if (do_utf8 && !UTF8_IS_INVARIANT(c))
-        c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
-                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
+       c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
+                           ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
+                                       UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
+       if (len == (STRLEN)-1)
+           Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+    }
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
@@ -4740,7 +4730,7 @@ S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp
            match = TRUE;
        if (!match) {
            AV *av;
-           SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
+           SV * const sw = regclass_swash(n, TRUE, 0, (SV**)&av);
        
            if (sw) {
                if (swash_fetch(sw, p, do_utf8))
@@ -4748,11 +4738,10 @@ S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp
                else if (flags & ANYOF_FOLD) {
                    if (!match && lenp && av) {
                        I32 i;
-                     
                        for (i = 0; i <= av_len(av); i++) {
-                           SV* sv = *av_fetch(av, i, FALSE);
+                           SV* const sv = *av_fetch(av, i, FALSE);
                            STRLEN len;
-                           const char *s = SvPV(sv, len);
+                           const char * const s = SvPV_const(sv, len);
                        
                            if (len <= plen && memEQ(s, (char*)p, len)) {
                                *lenp = len;
@@ -4903,7 +4892,7 @@ S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
 static void
 restore_pos(pTHX_ void *arg)
 {
-    (void)arg; /* unused */
+    PERL_UNUSED_ARG(arg);
     if (PL_reg_eval_set) {
        if (PL_reg_oldsaved) {
            PL_reg_re->subbeg = PL_reg_oldsaved;
@@ -4922,8 +4911,8 @@ restore_pos(pTHX_ void *arg)
 STATIC void
 S_to_utf8_substr(pTHX_ register regexp *prog)
 {
-    SV* sv;
     if (prog->float_substr && !prog->float_utf8) {
+       SV* sv;
        prog->float_utf8 = sv = newSVsv(prog->float_substr);
        sv_utf8_upgrade(sv);
        if (SvTAIL(prog->float_substr))
@@ -4932,6 +4921,7 @@ S_to_utf8_substr(pTHX_ register regexp *prog)
            prog->check_utf8 = sv;
     }
     if (prog->anchored_substr && !prog->anchored_utf8) {
+       SV* sv;
        prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
        sv_utf8_upgrade(sv);
        if (SvTAIL(prog->anchored_substr))
@@ -4944,8 +4934,8 @@ S_to_utf8_substr(pTHX_ register regexp *prog)
 STATIC void
 S_to_byte_substr(pTHX_ register regexp *prog)
 {
-    SV* sv;
     if (prog->float_utf8 && !prog->float_substr) {
+       SV* sv;
        prog->float_substr = sv = newSVsv(prog->float_utf8);
        if (sv_utf8_downgrade(sv, TRUE)) {
            if (SvTAIL(prog->float_utf8))
@@ -4958,6 +4948,7 @@ S_to_byte_substr(pTHX_ register regexp *prog)
            prog->check_substr = sv;
     }
     if (prog->anchored_utf8 && !prog->anchored_substr) {
+       SV* sv;
        prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
        if (sv_utf8_downgrade(sv, TRUE)) {
            if (SvTAIL(prog->anchored_utf8))