This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Net::SMTP can't send large messages with bleadperl
[perl5.git] / regexec.c
index c65c33b..a4c5aee 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -285,9 +285,8 @@ S_regcppop(pTHX_ const regexp *rex)
      * requiring null fields (pat.t#187 and split.t#{13,14}
      * (as of patchlevel 7877)  will fail.  Then again,
      * this code seems to be necessary or otherwise
-     * building DynaLoader will fail:
-     * "Error: '*' not in typemap in DynaLoader.xs, line 164"
-     * --jhi */
+     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+     * --jhi updated by dapm */
     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
        if (i > PL_regsize)
            PL_regoffs[i].start = -1;
@@ -308,7 +307,7 @@ S_regcppop(pTHX_ const regexp *rex)
  - pregexec - match a regexp against a string
  */
 I32
-Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
         char *strbeg, I32 minend, SV *screamer, U32 nosave)
 /* strend: pointer to null at end of string */
 /* strbeg: real beginning of string */
@@ -372,8 +371,8 @@ Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *stren
    deleted from the finite automaton. */
 
 char *
-Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
-                    char *strend, U32 flags, re_scream_pos_data *data)
+Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
+                    char *strend, const U32 flags, re_scream_pos_data *data)
 {
     dVAR;
     register I32 start_shift = 0;
@@ -1110,6 +1109,15 @@ REXEC_FBC_SCAN(                                       \
 if ((!reginfo || regtry(reginfo, &s))) \
     goto got_it
 
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
+    if (do_utf8) {                                             \
+       REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
+    }                                                          \
+    else {                                                     \
+       REXEC_FBC_CLASS_SCAN(CoNd);                            \
+    }                                                          \
+    break
+    
 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
     if (do_utf8) {                                             \
        UtFpReLoAd;                                            \
@@ -1425,6 +1433,31 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                !isDIGIT_LC_utf8((U8*)s),
                !isDIGIT_LC(*s)
            );
+       case LNBREAK:
+           REXEC_FBC_CSCAN(
+               is_LNBREAK_utf8(s),
+               is_LNBREAK_latin1(s)
+           );
+       case VERTWS:
+           REXEC_FBC_CSCAN(
+               is_VERTWS_utf8(s),
+               is_VERTWS_latin1(s)
+           );
+       case NVERTWS:
+           REXEC_FBC_CSCAN(
+               !is_VERTWS_utf8(s),
+               !is_VERTWS_latin1(s)
+           );
+       case HORIZWS:
+           REXEC_FBC_CSCAN(
+               is_HORIZWS_utf8(s),
+               is_HORIZWS_latin1(s)
+           );
+       case NHORIZWS:
+           REXEC_FBC_CSCAN(
+               !is_HORIZWS_utf8(s),
+               !is_HORIZWS_latin1(s)
+           );      
        case AHOCORASICKC:
        case AHOCORASICK: 
            {
@@ -1671,7 +1704,7 @@ S_swap_match_buff (pTHX_ regexp *prog) {
  - regexec_flags - match a regexp against a string
  */
 I32
-Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
              char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
 /* strend: pointer to null at end of string */
 /* strbeg: real beginning of string */
@@ -1811,7 +1844,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                    if (regtry(&reginfo, &s))
                        goto got_it;
                  after_try:
-                   if (s >= end)
+                   if (s > end)
                        goto phooey;
                    if (prog->extflags & RXf_USE_INTUIT) {
                        s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
@@ -2233,13 +2266,12 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
      * Actually, the code in regcppop() (which Ilya may be meaning by
      * PL_reglastparen), is not needed at all by the test suite
-     * (op/regexp, op/pat, op/split), but that code is needed, oddly
-     * enough, for building DynaLoader, or otherwise this
-     * "Error: '*' not in typemap in DynaLoader.xs, line 164"
-     * will happen.  Meanwhile, this code *is* needed for the
+     * (op/regexp, op/pat, op/split), but that code is needed otherwise
+     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+     * Meanwhile, this code *is* needed for the
      * above-mentioned test suite tests to succeed.  The common theme
      * on those tests seems to be returning null fields from matches.
-     * --jhi */
+     * --jhi updated by dapm */
 #if 1
     if (prog->nparens) {
        regexp_paren_pair *pp = PL_regoffs;
@@ -3207,8 +3239,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                      * pack("U0U*", 0xDF) =~ /ss/i,
                      * the 0xC3 0x9F are the UTF-8
                      * byte sequence for the U+00DF. */
+
                     if (!(do_utf8 &&
-                          toLOWER(s[0]) == 's' &&
+                          toLOWER(s[0]) == 's' &&
                           ln >= 2 &&
                           toLOWER(s[1]) == 's' &&
                           (U8)l[0] == 0xC3 &&
@@ -3663,14 +3696,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
                    }
                    else {
-                       STRLEN len;
-                       const char * const t = SvPV_const(ret, len);
-                       PMOP pm;
+                       U32 pm_flags = 0;
                        const I32 osize = PL_regsize;
 
-                       Zero(&pm, 1, PMOP);
-                       if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
-                       re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
+                       if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
+                       re = CALLREGCOMP(ret, pm_flags);
                        if (!(SvFLAGS(ret)
                              & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
                                | SVs_GMG)))
@@ -4189,12 +4219,6 @@ NULL
 
        case BRANCH:        /*  /(...|A|...)/ */
            scan = NEXTOPER(scan); /* scan now points to inner node */
-           if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ)) 
-               && !has_cutgroup)
-           {
-               /* last branch; skip state push and jump direct to node */
-               continue;
-            }
            ST.lastparen = *PL_reglastparen;
            ST.next_branch = next;
            REGCP_SET(ST.cp);
@@ -4972,6 +4996,55 @@ NULL
             sayNO;
             /* NOTREACHED */
 #undef ST
+        case FOLDCHAR:
+            n = ARG(scan);
+            if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
+                locinput += ln;
+            } else if ( 0xDF == n && !do_utf8 && !UTF ) {
+                sayNO;
+            } else  {
+                U8 folded[UTF8_MAXBYTES_CASE+1];
+                STRLEN foldlen;
+                const char * const l = locinput;
+                char *e = PL_regeol;
+                to_uni_fold(n, folded, &foldlen);
+
+               if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
+                              l, &e, 0,  do_utf8)) {
+                        sayNO;
+                }
+                locinput = e;
+            } 
+            nextchr = UCHARAT(locinput);  
+            break;
+        case LNBREAK:
+            if ((n=is_LNBREAK(locinput,do_utf8))) {
+                locinput += n;
+                nextchr = UCHARAT(locinput);
+            } else
+                sayNO;
+            break;
+
+#define CASE_CLASS(nAmE)                              \
+        case nAmE:                                    \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                locinput += n;                        \
+                nextchr = UCHARAT(locinput);          \
+            } else                                    \
+                sayNO;                                \
+            break;                                    \
+        case N##nAmE:                                 \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                sayNO;                                \
+            } else {                                  \
+                locinput += UTF8SKIP(locinput);       \
+                nextchr = UCHARAT(locinput);          \
+            }                                         \
+            break
+
+        CASE_CLASS(VERTWS);
+        CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
 
        default:
            PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
@@ -5383,7 +5456,77 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            while (scan < loceol && !isDIGIT(*scan))
                scan++;
        }
+    case LNBREAK:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+               scan += c;
+               hardcount++;
+           }
+       } else {
+           /*
+             LNBREAK can match two latin chars, which is ok,
+             because we have a null terminated string, but we
+             have to use hardcount in this situation
+           */
+           while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+               scan+=c;
+               hardcount++;
+           }
+       }       
        break;
+    case HORIZWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+               scan += c;
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && is_HORIZWS_latin1(scan)) 
+               scan++;         
+       }       
+       break;
+    case NHORIZWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !is_HORIZWS_latin1(scan))
+               scan++;
+
+       }       
+       break;
+    case VERTWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+               scan += c;
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && is_VERTWS_latin1(scan)) 
+               scan++;
+
+       }       
+       break;
+    case NVERTWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !is_VERTWS_latin1(scan)) 
+               scan++;
+          
+       }       
+       break;
+
     default:           /* Called on something of 0 width. */
        break;          /* So match right here or not at all. */
     }