This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Eliminate prelen from struct regexp. Possibly we are hardcoding a bit
[perl5.git] / regcomp.c
index bcbfec3..ff099f5 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1163,11 +1163,19 @@ is the recommended Unicode-aware way of saying
 
 #define TRIE_STORE_REVCHAR                                                 \
     STMT_START {                                                           \
-       SV *tmp = newSVpvs("");                                            \
-       if (UTF) SvUTF8_on(tmp);                                           \
-       Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc );                       \
-       av_push( revcharmap, tmp );                                        \
-    } STMT_END
+       if (UTF) {                                                         \
+           SV *zlopp = newSV(2);                                          \
+           unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
+           unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+           SvCUR_set(zlopp, kapow - flrbbbbb);                            \
+           SvPOK_on(zlopp);                                               \
+           SvUTF8_on(zlopp);                                              \
+           av_push(revcharmap, zlopp);                                    \
+       } else {                                                           \
+           char ooooff = (char)uvc;                                               \
+           av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
+       }                                                                  \
+        } STMT_END
 
 #define TRIE_READ_CHAR STMT_START {                                           \
     wordlen++;                                                                \
@@ -1405,7 +1413,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                     /* store the codepoint in the bitmap, and if its ascii
                        also store its folded equivelent. */
                     TRIE_BITMAP_SET(trie,uvc);
-                    if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+
+                   /* store the folded codepoint */
+                   if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+
+                   if ( !UTF ) {
+                       /* store first byte of utf8 representation of
+                          codepoints in the 127 < uvc < 256 range */
+                       if (127 < uvc && uvc < 192) {
+                           TRIE_BITMAP_SET(trie,194);
+                       } else if (191 < uvc ) {
+                           TRIE_BITMAP_SET(trie,195);
+                       /* && uvc < 256 -- we know uvc is < 256 already */
+                       }
+                   }
                     set_bit = 0; /* We've done our bit :-) */
                 }
             } else {
@@ -2003,24 +2024,35 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                 trie->startstate = state;
                 trie->minlen -= (state - 1);
                 trie->maxlen -= (state - 1);
-                DEBUG_r({
-                    regnode *fix = convert;
-                    U32 word = trie->wordcount;
-                    mjd_nodelen++;
-                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
-                    while( ++fix < n ) {
-                        Set_Node_Offset_Length(fix, 0, 0);
-                    }
-                    while (word--) {
-                        SV ** const tmp = av_fetch( trie_words, word, 0 );
-                        if (tmp) {
-                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
-                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
-                            else
-                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
-                        }
-                    }    
-                });
+#ifdef DEBUGGING
+               /* At least the UNICOS C compiler choked on this
+                * being argument to DEBUG_r(), so let's just have
+                * it right here. */
+               if (
+#ifdef PERL_EXT_RE_BUILD
+                   1
+#else
+                   DEBUG_r_TEST
+#endif
+                   ) {
+                   regnode *fix = convert;
+                   U32 word = trie->wordcount;
+                   mjd_nodelen++;
+                   Set_Node_Offset_Length(convert, mjd_offset, state - 1);
+                   while( ++fix < n ) {
+                       Set_Node_Offset_Length(fix, 0, 0);
+                   }
+                   while (word--) {
+                       SV ** const tmp = av_fetch( trie_words, word, 0 );
+                       if (tmp) {
+                           if ( STR_LEN(convert) <= SvCUR(*tmp) )
+                               sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
+                           else
+                               sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
+                       }
+                   }
+               }
+#endif
                 if (trie->maxlen) {
                     convert = n;
                } else {
@@ -2759,7 +2791,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                     last = cur;
                                 }
                             } else {
-                                if ( last ) {
+/* 
+    Currently we assume that the trie can handle unicode and ascii
+    matches fold cased matches. If this proves true then the following
+    define will prevent tries in this situation. 
+    
+    #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+*/
+#define TRIE_TYPE_IS_SAFE 1
+                                if ( last && TRIE_TYPE_IS_SAFE ) {
                                     make_trie( pRExC_state, 
                                             startbranch, first, cur, tail, count, 
                                             optype, depth+1 );
@@ -2787,7 +2827,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
 
                         });
-                        if ( last ) {
+                        
+                        if ( last && TRIE_TYPE_IS_SAFE ) {
                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
 #ifdef TRIE_STUDY_OPT  
                             if ( ((made == MADE_EXACT_TRIE && 
@@ -3297,7 +3338,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                        SvUTF8(sv) && SvMAGICAL(sv) ?
                                        mg_find(sv, PERL_MAGIC_utf8) : NULL;
                                    if (mg && mg->mg_len >= 0)
-                                       mg->mg_len += CHR_SVLEN(last_str);
+                                       mg->mg_len += CHR_SVLEN(last_str) - l;
                                }
                                data->last_end += l * (mincount - 1);
                            }
@@ -4240,7 +4281,6 @@ redo_first_pass:
     RXi_SET( r, ri );
     r->engine= RE_ENGINE_PTR;
     r->refcnt = 1;
-    r->prelen = plen;
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
@@ -4249,7 +4289,7 @@ redo_first_pass:
        U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
        const char *fptr = STD_PAT_MODS;        /*"msix"*/
        char *p;
-        r->wraplen = r->prelen + has_minus + has_p + has_runon
+        r->wraplen = plen + has_minus + has_p + has_runon
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
@@ -4277,9 +4317,10 @@ redo_first_pass:
         }
 
         *p++ = ':';
-        Copy(RExC_precomp, p, r->prelen, char);
-        r->precomp = p;
-        p += r->prelen;
+        Copy(RExC_precomp, p, plen, char);
+       assert ((r->wrapped - p) < 16);
+       r->pre_prefix = p - r->wrapped;
+        p += plen;
         if (has_runon)
             *p++ = '\n';
         *p++ = ')';
@@ -4753,23 +4794,27 @@ reStudy:
         r->paren_names = NULL;
 
 #ifdef STUPID_PATTERN_CHECKS            
-    if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
+    if (RX_PRELEN(r) == 0)
+        r->extflags |= RXf_NULL;
+    if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
         /* XXX: this should happen BEFORE we compile */
         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 
-    else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3))
+    else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3))
         r->extflags |= RXf_WHITE;
-    else if (r->prelen == 1 && r->precomp[0] == '^')
+    else if (RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == '^')
         r->extflags |= RXf_START_ONLY;
 #else
-    if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
+    if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
             /* XXX: this should happen BEFORE we compile */
             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 
     else {
         regnode *first = ri->program + 1;
-        char fop = OP(first);
-        char nop = OP(NEXTOPER(first));
+        U8 fop = OP(first);
+        U8 nop = OP(NEXTOPER(first));
         
-         if (PL_regkind[fop] == BOL && nop == END) 
+        if (PL_regkind[fop] == NOTHING && nop == END)
+            r->extflags |= RXf_NULL;
+        else if (PL_regkind[fop] == BOL && nop == END)
             r->extflags |= RXf_START_ONLY;
         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
             r->extflags |= RXf_WHITE;    
@@ -4919,9 +4964,13 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
 SV*
 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
 {
-    (void)hv_iterinit(rx->paren_names);
+    if ( rx && rx->paren_names ) {
+       (void)hv_iterinit(rx->paren_names);
 
-    return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
+       return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
+    } else {
+       return FALSE;
+    }
 }
 
 SV*
@@ -5160,7 +5209,7 @@ SV*
 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
 {
        PERL_UNUSED_ARG(rx);
-       return newSVpvs("Regexp");
+       return NULL;
 }
 
 /* Scans the name of a named buffer from the pattern.
@@ -6467,6 +6516,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
             | PERL_SCAN_DISALLOW_PREFIX
             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
         UV cp;
+       char string;
         len = (STRLEN)(endbrace - name - 2);
         cp = grok_hex(name + 2, &len, &fl, NULL);
         if ( len != (STRLEN)(endbrace - name - 2) ) {
@@ -6478,7 +6528,8 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
             *valuep = cp;
             return NULL;
         }
-        sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
+       string = (char)cp;
+        sv_str= newSVpvn(&string, 1);
     } else {
         /* fetch the charnames handler for this scope */
         HV * const table = GvHV(PL_hintgv);
@@ -6825,8 +6876,10 @@ tryagain:
     case 0xDF:
     case 0xC3:
     case 0xCE:
+        do_foldchar:
         if (!LOC && FOLD) {
             U32 len,cp;
+           len=0; /* silence a spurious compiler warning */
             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
@@ -6850,7 +6903,11 @@ tryagain:
           required, as the default for this switch is to jump to the
           literal text handling code.
        */
-       switch (*++RExC_parse) {
+       switch ((U8)*++RExC_parse) {
+       case 0xDF:
+       case 0xC3:
+       case 0xCE:
+                  goto do_foldchar;        
        /* Special Escapes */
        case 'A':
            RExC_seen_zerolen++;
@@ -6866,6 +6923,11 @@ tryagain:
            RExC_seen_zerolen++;
            ret = reg_node(pRExC_state, KEEPS);
            *flagp |= SIMPLE;
+           /* XXX:dmq : disabling in-place substitution seems to
+            * be necessary here to avoid cases of memory corruption, as
+            * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+            */
+           RExC_seen |= REG_SEEN_LOOKBEHIND;
            goto finish_meta_pat;
        case 'Z':
            ret = reg_node(pRExC_state, SEOL);
@@ -7052,6 +7114,8 @@ tryagain:
                        goto parse_named_seq;
                }   }
                num = atoi(RExC_parse);
+               if (isg && num == 0)
+                   vFAIL("Reference to invalid group 0");
                 if (isrel) {
                     num = RExC_npar - num;
                     if (num < 1)
@@ -7161,8 +7225,13 @@ tryagain:
                       an unescaped equivalent literal.
                    */
 
-                   switch (*++p) {
+                   switch ((U8)*++p) {
                    /* These are all the special escapes. */
+                   case 0xDF:
+                   case 0xC3:
+                   case 0xCE:
+                          if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
+                               goto normal_default;                
                    case 'A':             /* Start assertion */
                    case 'b': case 'B':   /* Word-boundary assertion*/
                    case 'C':             /* Single char !DANGEROUS! */
@@ -8775,19 +8844,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     k = PL_regkind[OP(o)];
 
     if (k == EXACT) {
-       SV * const dsv = sv_2mortal(newSVpvs(""));
+       sv_catpvs(sv, " ");
        /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 
         * is a crude hack but it may be the best for now since 
         * we have no flag "this EXACTish node was UTF-8" 
         * --jhi */
-       const char * const s = 
-           pv_pretty(dsv, STRING(o), STR_LEN(o), 60, 
-               PL_colors[0], PL_colors[1],
-               PERL_PV_ESCAPE_UNI_DETECT |
-               PERL_PV_PRETTY_ELIPSES    |
-               PERL_PV_PRETTY_LTGT    
-            ); 
-       Perl_sv_catpvf(aTHX_ sv, " %s", s );
+       pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
+                 PERL_PV_ESCAPE_UNI_DETECT |
+                 PERL_PV_PRETTY_ELLIPSES   |
+                 PERL_PV_PRETTY_LTGT       |
+                 PERL_PV_PRETTY_NOCLEAR
+                 );
     } else if (k == TRIE) {
        /* print the details of the trie in dumpuntil instead, as
         * progi->data isn't available here */
@@ -8816,7 +8883,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
             int i;
             int rangestart = -1;
             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
-            Perl_sv_catpvf(aTHX_ sv, "[");
+            sv_catpvs(sv, "[");
             for (i = 0; i <= 256; i++) {
                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
                     if (rangestart == -1)
@@ -8833,7 +8900,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
                     rangestart = -1;
                 }
             }
-            Perl_sv_catpvf(aTHX_ sv, "]");
+            sv_catpvs(sv, "]");
         } 
         
     } else if (k == CURLY) {
@@ -8876,7 +8943,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     } else if (k == LOGICAL)
        Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
     else if (k == FOLDCHAR)
-       Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]",ARG(o) );        
+       Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
     else if (k == ANYOF) {
        int i, rangestart = -1;
        const U8 flags = ANYOF_FLAGS(o);
@@ -9177,7 +9244,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const r)
        {
            SV *dsv= sv_newmortal();
             RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
-                dsv, r->precomp, r->prelen, 60);
+                dsv, RX_PRECOMP(r), RX_PRELEN(r), 60);
             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 
                 PL_colors[4],PL_colors[5],s);
         }
@@ -9353,7 +9420,6 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
     }
 
     ret->wrapped        = SAVEPVN(ret->wrapped, ret->wraplen+1);
-    ret->precomp        = ret->wrapped + (ret->precomp - ret->wrapped);
     ret->paren_names    = hv_dup_inc(ret->paren_names, param);
 
     if (ret->pprivate)
@@ -9644,12 +9710,24 @@ clear_re(pTHX_ void *r)
 STATIC void
 S_put_byte(pTHX_ SV *sv, int c)
 {
-    if (isCNTRL(c) || c == 255 || !isPRINT(c))
+    /* Our definition of isPRINT() ignores locales, so only bytes that are
+       not part of UTF-8 are considered printable. I assume that the same
+       holds for UTF-EBCDIC.
+       Also, code point 255 is not printable in either (it's E0 in EBCDIC,
+       which Wikipedia says:
+
+       EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
+       ones (binary 1111 1111, hexadecimal FF). It is similar, but not
+       identical, to the ASCII delete (DEL) or rubout control character.
+       ) So the old condition can be simplified to !isPRINT(c)  */
+    if (!isPRINT(c))
        Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
-    else if (c == '-' || c == ']' || c == '\\' || c == '^')
-       Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
-    else
-       Perl_sv_catpvf(aTHX_ sv, "%c", c);
+    else {
+       const char string = c;
+       if (c == '-' || c == ']' || c == '\\' || c == '^')
+           sv_catpvs(sv, "\\");
+       sv_catpvn(sv, &string, 1);
+    }
 }
 
 
@@ -9752,7 +9830,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
                            PL_colors[0], PL_colors[1],
                            (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
-                           PERL_PV_PRETTY_ELIPSES    |
+                           PERL_PV_PRETTY_ELLIPSES    |
                            PERL_PV_PRETTY_LTGT
                             )
                             : "???"