X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/304ee84bde82d4eee33b0d0ff03080b360eae72b..0a96133fa53fe1b2ee03b5cce4cdd9c7c96a0867:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 359e4f6..ce4104a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2,7 +2,9 @@ */ /* - * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] */ /* This file contains functions for compiling a regular expression. See @@ -57,7 +59,8 @@ **** Alterations to Henry's code are... **** **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, - **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -102,6 +105,7 @@ typedef struct RExC_state_t { U32 flags; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object pprivate field */ char *start; /* Start of input for compile */ @@ -109,6 +113,7 @@ typedef struct RExC_state_t { char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the allocated space */ regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ @@ -123,9 +128,12 @@ typedef struct RExC_state_t { regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ - I32 utf8; - HV *charnames; /* cache of named sequences */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ HV *paren_names; /* Paren names */ + regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ #if ADD_TO_REGEXEC @@ -135,22 +143,28 @@ typedef struct RExC_state_t { #ifdef DEBUGGING const char *lastparse; I32 lastnum; + AV *paren_name_list; /* idx -> name */ #define RExC_lastparse (pRExC_state->lastparse) #define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) #endif } RExC_state_t; #define RExC_flags (pRExC_state->flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) #define RExC_start (pRExC_state->start) #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) -#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */ +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) #define RExC_naughty (pRExC_state->naughty) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) @@ -161,7 +175,7 @@ typedef struct RExC_state_t { #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) -#define RExC_charnames (pRExC_state->charnames) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) #define RExC_open_parens (pRExC_state->open_parens) #define RExC_close_parens (pRExC_state->close_parens) #define RExC_opend (pRExC_state->opend) @@ -169,6 +183,7 @@ typedef struct RExC_state_t { #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) @@ -180,10 +195,11 @@ typedef struct RExC_state_t { * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ -#define HASWIDTH 0x1 /* Known to match non-null strings. */ -#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 0x4 /* Starts with * or +. */ -#define TRYAGAIN 0x8 /* Weeded out a declaration. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ +#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 0x04 /* Starts with * or +. */ +#define TRYAGAIN 0x08 /* Weeded out a declaration. */ +#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -376,7 +392,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -407,7 +423,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -425,7 +441,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -444,7 +460,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -457,23 +473,22 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define vWARN(loc,m) STMT_START { \ +#define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define vWARNdep(loc,m) STMT_START { \ +#define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END - -#define vWARN2(loc, m, a1) STMT_START { \ +#define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -483,12 +498,24 @@ static const scan_data_t zero_scan_data = a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ @@ -507,7 +534,21 @@ static const scan_data_t zero_scan_data = * Element 0 holds the number n. * Position is 1 indexed. */ - +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x #define Set_Node_Offset_To_R(node,byte) STMT_START { \ if (! SIZE_ONLY) { \ MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ @@ -550,11 +591,11 @@ static const scan_data_t zero_scan_data = Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ } STMT_END - +#endif #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN -#endif +#endif /*RE_TRACK_PATTERN_OFFSETS*/ #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ @@ -603,6 +644,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min const STRLEN old_l = CHR_SVLEN(*data->longest); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_SCAN_COMMIT; + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { SvSetMagicSV(*data->longest, data->last_found); if (*data->longest == data->longest_fixed) { @@ -642,13 +685,15 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("cl_anything: ",data,0); + DEBUG_STUDYDATA("commit: ",data,0); } /* Can match anything (initialization) */ STATIC void S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_ANYTHING; + ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; @@ -662,6 +707,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) { int value; + PERL_ARGS_ASSERT_CL_IS_ANYTHING; + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; @@ -676,6 +723,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) STATIC void S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -684,6 +733,8 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) STATIC void S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT_ZERO; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -697,6 +748,7 @@ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) { + PERL_ARGS_ASSERT_CL_AND; assert(and_with->type == ANYOF); if (!(and_with->flags & ANYOF_CLASS) @@ -735,6 +787,8 @@ S_cl_and(struct regnode_charclass_class *cl, STATIC void S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { + PERL_ARGS_ASSERT_CL_OR; + if (or_with->flags & ANYOF_INVERT) { /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -826,6 +880,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_DUMP_TRIE; PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", (int)depth * 2 + 2,"", @@ -908,6 +963,9 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + /* print out the table precompression. */ PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", @@ -963,6 +1021,8 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; /* print out the table precompression so that we can do a visual check @@ -1136,11 +1196,19 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = newSVpvs(""); \ - if (UTF) SvUTF8_on(tmp); \ - Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ - av_push( revcharmap, tmp ); \ - } STMT_END + if (UTF) { \ + SV *zlopp = newSV(2); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)uvc; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END #define TRIE_READ_CHAR STMT_START { \ wordlen++; \ @@ -1195,10 +1263,9 @@ is the recommended Unicode-aware way of saying /* store the word for dumping */ \ SV* tmp; \ if (OP(noper) != NOTHING) \ - tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ else \ - tmp = newSVpvn( "", 0 ); \ - if ( UTF ) SvUTF8_on( tmp ); \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ av_push( trie_words, tmp ); \ }); \ \ @@ -1206,7 +1273,7 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1220,7 +1287,7 @@ is the recommended Unicode-aware way of saying /* we only allocate the nextword buffer when there */\ /* a dupe, so first time we have to do the allocation */\ if (!trie->nextword) \ - trie->nextword = \ + trie->nextword = (U16 *) \ PerlMemShared_calloc( word_count + 1, sizeof(U16)); \ while ( trie->nextword[dupe] ) \ dupe= trie->nextword[dupe]; \ @@ -1283,18 +1350,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif SV *re_trie_maxbuff; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif - trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); trie->refcount = 1; trie->startstate = 1; trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; - trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) ); + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); if (!(UTF && folder)) - trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); DEBUG_r({ trie_words = newAV(); }); @@ -1329,7 +1398,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store unicode characters. We use the + (trie->charmap) and we use a an HV* to store Unicode characters. We use the native representation of the character value as the key and IV's for the coded index. @@ -1352,16 +1421,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ - STRLEN chars=0; + STRLEN chars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ if (OP(noper) == NOTHING) { trie->minlen= 0; continue; } - if (trie->bitmap) { - TRIE_BITMAP_SET(trie,*uc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]); - } + if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; @@ -1373,6 +1443,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; TRIE_STORE_REVCHAR; } + if ( set_bit ) { + /* store the codepoint in the bitmap, and if its ascii + also store its folded equivelent. */ + TRIE_BITMAP_SET(trie,uvc); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } + set_bit = 0; /* We've done our bit :-) */ + } } else { SV** svpp; if ( !widecharmap ) @@ -1406,7 +1496,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, (int)trie->minlen, (int)trie->maxlen ) ); - trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) ); + trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) ); /* We now know what we are dealing with in terms of unique chars and @@ -1449,8 +1539,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs "%*sCompiling trie using list compiler\n", (int)depth * 2 + 2, "")); - trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, - sizeof(reg_trie_state) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); TRIE_LIST_NEW(1); next_alloc = 2; @@ -1514,8 +1605,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* next alloc is the NEXT state to be allocated */ trie->statecount = next_alloc; - trie->states = PerlMemShared_realloc( trie->states, next_alloc - * sizeof(reg_trie_state) ); + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); /* and now dump it out before we compress it */ DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, @@ -1523,8 +1616,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs depth+1) ); - trie->trans - = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); { U32 state; U32 tp = 0; @@ -1555,8 +1648,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( transcount < tp + maxid - minid + 1) { transcount *= 2; - trie->trans - = PerlMemShared_realloc( trie->trans, + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); @@ -1638,11 +1731,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs "%*sCompiling trie using table compiler\n", (int)depth * 2 + 2, "")); - trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) - * trie->uniquecharcount + 1, - sizeof(reg_trie_trans) ); - trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, - sizeof(reg_trie_state) ); + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); next_alloc = trie->uniquecharcount + 1; @@ -1799,8 +1894,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } } trie->lasttrans = pos + 1; - trie->states = PerlMemShared_realloc( trie->states, laststate - * sizeof(reg_trie_state) ); + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", @@ -1820,8 +1916,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs (UV)trie->lasttrans) ); /* resize the trans array to remove unused space */ - trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans - * sizeof(reg_trie_trans) ); + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); /* and now dump out the compressed format */ DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); @@ -1832,9 +1929,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #ifdef DEBUGGING regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + U32 mjd_offset = 0; U32 mjd_nodelen = 0; -#endif +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ /* This means we convert either the first branch or the first Exact, depending on whether the thing following (in 'last') is a branch @@ -1847,25 +1947,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( first != startbranch || OP( last ) == BRANCH ) { /* branch sub-chain */ NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_r({ mjd_offset= Node_Offset((convert)); mjd_nodelen= Node_Length((convert)); }); +#endif /* whole branch chain */ - } else { + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { DEBUG_r({ const regnode *nop = NEXTOPER( convert ); mjd_offset= Node_Offset((nop)); mjd_nodelen= Node_Length((nop)); }); } - DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); - +#endif /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; @@ -1904,7 +2007,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( folder ) TRIE_BITMAP_SET(trie, folder[ *ch ]); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, (char*)ch) + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) ); } } @@ -1918,7 +2021,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( count == 1 ) { SV **tmp = av_fetch( revcharmap, idx, 0); - char *ch = SvPV_nolen( *tmp ); + STRLEN len; + char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); PerlIO_printf( Perl_debug_log, @@ -1937,11 +2041,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs str=STRING(convert); STR_LEN(convert)=0; } - while (*ch) { + STR_LEN(convert) += len; + while (len--) *str++ = *ch++; - STR_LEN(convert)++; - } - } else { #ifdef DEBUGGING if (state>1) @@ -1956,24 +2058,35 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->startstate = state; trie->minlen -= (state - 1); trie->maxlen -= (state - 1); - DEBUG_r({ - regnode *fix = convert; - U32 word = trie->wordcount; - mjd_nodelen++; - Set_Node_Offset_Length(convert, mjd_offset, state - 1); - while( ++fix < n ) { - Set_Node_Offset_Length(fix, 0, 0); - } - while (word--) { - SV ** const tmp = av_fetch( trie_words, word, 0 ); - if (tmp) { - if ( STR_LEN(convert) <= SvCUR(*tmp) ) - sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); - else - sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); - } - } - }); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif if (trie->maxlen) { convert = n; } else { @@ -2017,6 +2130,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* needed for dumping*/ DEBUG_r(if (optimize) { regnode *opt = convert; + while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } @@ -2082,19 +2196,21 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, 1, "T" ); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif ARG_SET( stclass, data_slot ); - aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); RExC_rxi->data->data[ data_slot ] = (void*)aho; aho->trie=trie_offset; aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); Copy( trie->states, aho->states, numstates, reg_trie_state ); Newxz( q, numstates, U32); - aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); aho->refcount = 1; fail = aho->fail; /* initialize fail[0..1] to be 1 so that we always have @@ -2198,6 +2314,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags #else PERL_UNUSED_ARG(depth); #endif + + PERL_ARGS_ASSERT_JOIN_EXACT; #ifndef EXPERIMENTAL_INPLACESCAN PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(val); @@ -2351,6 +2469,34 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -2380,9 +2526,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first_non_open = scan; I32 stopmin = I32_MAX; scan_frame *frame = NULL; - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_STUDY_CHUNK; + #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif @@ -2683,7 +2830,20 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last = cur; } } else { - if ( last ) { +/* + Currently we do not believe that the trie logic can + handle case insensitive matching properly when the + pattern is not unicode (thus forcing unicode semantics). + + If/when this is fixed the following define can be swapped + in below to fully enable trie logic. + +#define TRIE_TYPE_IS_SAFE 1 + +*/ +#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) + + if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, startbranch, first, cur, tail, count, optype, depth+1 ); @@ -2711,7 +2871,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + + if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && @@ -3032,11 +3193,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3 /* Complement check for big count */ - && ckWARN(WARN_REGEXP)) + && maxcount <= REG_INFTY/3) /* Complement check for big count */ { - vWARN(RExC_parse, - "Quantifier unexpected on zero-length expression"); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); } min += minnext * mincount; @@ -3201,9 +3361,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ - last_str = newSVpvn(s + old, l); - if (UTF) - SvUTF8_on(last_str); + last_str = newSVpvn_utf8(s + old, l, UTF); if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { @@ -3221,7 +3379,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str); + mg->mg_len += CHR_SVLEN(last_str) - l; } data->last_end += l * (mincount - 1); } @@ -3281,6 +3439,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_VERTWS_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 0; value < 256; value++) + if (is_VERTWS_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 1; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + + } + else if (OP(scan) == FOLDCHAR) { + int d = ARG(scan)==0xDF ? 1 : 2; + flags &= ~SCF_DO_STCLASS; + min += 1; + delta += d; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += d; + data->longest = &(data->longest_float); + } + } else if (strchr((const char*)PL_simple,OP(scan))) { int value = 0; @@ -3475,6 +3673,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -3539,11 +3740,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (f & SCF_DO_STCLASS_AND) { - const int was = (data->start_class->flags & ANYOF_EOS); - - cl_and(data->start_class, &intrnl); - if (was) - data->start_class->flags |= ANYOF_EOS; + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + cl_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue */ + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } } } #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY @@ -3845,6 +4057,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ + /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -3889,6 +4102,8 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + PERL_ARGS_ASSERT_ADD_DATA; + Renewc(RExC_rxi->data, sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), char, struct reg_data); @@ -3965,39 +4180,45 @@ Perl_reginitcolors(pTHX) extern const struct regexp_engine my_reg_engine; #define RE_ENGINE_PTR &my_reg_engine #endif -/* these make a few things look better, to avoid indentation */ -#define BEGIN_BLOCK { -#define END_BLOCK } - -regexp * -Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) + +#ifndef PERL_IN_XSUB_RE +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { dVAR; - GET_RE_DEBUG_FLAGS_DECL; - DEBUG_r(if (!PL_colorset) reginitcolors()); -#ifndef PERL_IN_XSUB_RE - BEGIN_BLOCK + HV * const table = GvHV(PL_hintgv); + + PERL_ARGS_ASSERT_PREGCOMP; + /* Dispatch a request to compile a regexp to correct regexp engine. */ - HV * const table = GvHV(PL_hintgv); if (table) { SV **ptr= hv_fetchs(table, "regcomp", FALSE); + GET_RE_DEBUG_FLAGS_DECL; if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); DEBUG_COMPILE_r({ PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", SvIV(*ptr)); }); - return CALLREGCOMP_ENG(eng, exp, xend, pm); + return CALLREGCOMP_ENG(eng, pattern, flags); } } - END_BLOCK + return Perl_re_compile(aTHX_ pattern, flags); +} #endif - BEGIN_BLOCK - register regexp *r; + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) +{ + dVAR; + REGEXP *rx; + struct regexp *r; register regexp_internal *ri; + STRLEN plen; + char *exp = SvPV(pattern, plen); + char* xend = exp + plen; regnode *scan; - regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; @@ -4009,20 +4230,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) int restudied= 0; RExC_state_t copyRExC_state; #endif - if (exp == NULL) - FAIL("NULL regexp argument"); + GET_RE_DEBUG_FLAGS_DECL; - RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + PERL_ARGS_ASSERT_RE_COMPILE; + + DEBUG_r(if (!PL_colorset) reginitcolors()); + + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); - RExC_precomp = exp; DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, RExC_precomp, (xend - exp), 60); + dsv, exp, plen, 60); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); - RExC_flags = pm->op_pmflags; + +redo_first_pass: + RExC_precomp = exp; + RExC_flags = pm_flags; RExC_sawback = 0; RExC_seen = 0; @@ -4040,11 +4266,13 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_size = 0L; RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; - RExC_charnames = NULL; RExC_open_parens = NULL; RExC_close_parens = NULL; RExC_opend = NULL; RExC_paren_names = NULL; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif RExC_recurse = NULL; RExC_recurse_count = 0; @@ -4057,6 +4285,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_precomp = NULL; return(NULL); } + if (RExC_utf8 && !RExC_orig_utf8) { + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + XXX: somehow figure out how to make this less expensive... + -- dmq */ + STRLEN len = plen; + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8; + SAVEFREEPV(exp); + goto redo_first_pass; + } DEBUG_PARSE_r({ PerlIO_printf(Perl_debug_log, "Required size %"IVdf" nodes\n" @@ -4074,15 +4321,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; -#ifdef DEBUGGING - /* Make room for a sentinel value at the end of the program */ - RExC_size++; -#endif - /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ - Newxz(r, 1, regexp); + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = (struct regexp*)SvANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -4098,10 +4341,55 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* non-zero initialization begins here */ RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; - r->refcnt = 1; - r->prelen = xend - exp; - r->precomp = savepvn(RExC_precomp, r->prelen); - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + r->extflags = pm_flags; + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + const STRLEN wraplen = plen + has_minus + has_p + has_runon + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + p = sv_grow(MUTABLE_SV(rx), wraplen + 1); + SvCUR_set(rx, wraplen); + SvPOK_on(rx); + SvFLAGS(rx) |= SvUTF8(pattern); + *p++='('; *p++='?'; + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; + char *colon = r + 1; + char ch; + + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + else + *r-- = ch; + reganch >>= 1; + } + if(has_minus) { + *r = '-'; + p = colon; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + } + r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -4113,37 +4401,35 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) } /* Useful during FAIL. */ - Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ - if (ri->offsets) { - ri->offsets[0] = RExC_size; - } +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, "%s %"UVuf" bytes for offset annotations.\n", - ri->offsets ? "Got" : "Couldn't get", + ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); - +#endif + SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; RExC_rx = r; RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ + RExC_flags = pm_flags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; RExC_emit_start = ri->program; RExC_emit = ri->program; -#ifdef DEBUGGING - /* put a sentinal on the end of the program so we can check for - overwrites */ - ri->program[RExC_size].type = 255; -#endif + RExC_emit_bound = ri->program + RExC_size + 1; + /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); - if (reg(pRExC_state, 0, &flags,1) == NULL) + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); return(NULL); - + } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4157,7 +4443,10 @@ reStudy: Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT - if ( restudied ) { + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); @@ -4172,19 +4461,17 @@ reStudy: SvREFCNT_dec(data.last_found); } StructCopy(&zero_scan_data, &data, scan_data_t); - } else { - StructCopy(&zero_scan_data, &data, scan_data_t); - copyRExC_state = RExC_state; } #else StructCopy(&zero_scan_data, &data, scan_data_t); #endif /* Dig out information for optimizations. */ - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */ - pm->op_pmflags = RExC_flags; + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + if (UTF) - r->extflags |= RXf_UTF8; /* Unicode in it? */ + SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->intflags |= PREGf_NAUGHTY; @@ -4198,29 +4485,43 @@ reStudy: struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; I32 last_close = 0; /* pointed to by data */ - - first = scan; - /* Skip introductions and multiplicators >= 1. */ + regnode *first= scan; + regnode *first_next= regnext(first); + + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ - (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || + (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ (OP(first) == IFMATCH && !first->flags) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ - (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + */ if (OP(first) == PLUS) sawplus = 1; else first += regarglen[OP(first)]; - if (OP(first) == IFMATCH) { - first = NEXTOPER(first); - first += EXTRA_STEP_2ARGS; - } else /* XXX possible optimisation for /(?=)/ */ - first = NEXTOPER(first); + + first = NEXTOPER(first); + first_next= regnext(first); } /* Starting-point info. */ @@ -4240,12 +4541,12 @@ reStudy: regnode *trie_op; /* this can happen only on restudy */ if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = + struct regnode_1 *trieop = (struct regnode_1 *) PerlMemShared_calloc(1, sizeof(struct regnode_1)); StructCopy(first,trieop,struct regnode_1); trie_op=(regnode *)trieop; } else { - struct regnode_charclass *trieop = + struct regnode_charclass *trieop = (struct regnode_charclass *) PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); StructCopy(first,trieop,struct regnode_charclass); trie_op=(regnode *)trieop; @@ -4345,6 +4646,7 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen + && !(RExC_seen & REG_SEEN_VERBARG) && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) r->extflags |= RXf_CHECK_ALL; scan_commit(pRExC_state, &data,&minlen,0); @@ -4562,141 +4864,529 @@ reStudy: if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (RExC_paren_names) - r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else - r->paren_names = NULL; - + RXp_PAREN_NAMES(r) = NULL; + +#ifdef STUPID_PATTERN_CHECKS + if (RX_PRELEN(rx) == 0) + r->extflags |= RXf_NULL; + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + r->extflags |= RXf_WHITE; + else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') + r->extflags |= RXf_START_ONLY; +#else + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif +#ifdef DEBUGGING + if (RExC_paren_names) { + ri->name_list_idx = add_data( pRExC_state, 1, "p" ); + ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + } else +#endif + ri->name_list_idx = 0; + if (RExC_recurse_count) { for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { const regnode *scan = RExC_recurse[RExC_recurse_count-1]; ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); } } - Newxz(r->startp, RExC_npar, I32); - Newxz(r->endp, RExC_npar, I32); + Newxz(r->offs, RExC_npar, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); - DEBUG_OFFSETS_r(if (ri->offsets) { - const U32 len = ri->offsets[0]; +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const U32 len = ri->u.offsets[0]; U32 i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]); + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { - if (ri->offsets[i*2-1] || ri->offsets[i*2]) + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", - (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]); + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } PerlIO_printf(Perl_debug_log, "\n"); }); - return(r); - END_BLOCK +#endif + return rx; } -#undef CORE_ONLY_BLOCK -#undef END_BLOCK #undef RE_ENGINE_PTR -#ifndef PERL_IN_XSUB_RE + SV* -Perl_reg_named_buff_sv(pTHX_ SV* namesv) +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) { - I32 parno = 0; /* no match */ - if (PL_curpm) { - const REGEXP * const rx = PM_GETRE(PL_curpm); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - for ( i=0; ilastparen) >= nums[i] && - rx->endp[nums[i]] != -1) - { - parno = nums[i]; - break; - } + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak(aTHX_ "%s", PL_no_modify); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + AV *retarray = NULL; + SV *ret; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (flags & RXapif_ALL) + retarray=newAV(); + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); + if (!retarray) + return ret; + } else { + ret = newSVsv(&PL_sv_undef); } + if (retarray) + av_push(retarray, ret); } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); } } - if ( !parno ) { - return 0; + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } } else { - GV *gv_paren; - SV *sv= sv_newmortal(); - Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno); - gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV); - return GvSVn(gv_paren); + return FALSE; } } -#endif -/* Scans the name of a named buffer from the pattern. - * If flags is REG_RSN_RETURN_NULL returns null. - * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name - * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding - * to the parsed name as looked up in the RExC_paren_names hash. - * If there is an error throws a vFAIL().. type exception. - */ +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); -#define REG_RSN_RETURN_NULL 0 -#define REG_RSN_RETURN_NAME 1 -#define REG_RSN_RETURN_DATA 2 + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; -STATIC SV* -S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { - char *name_start = RExC_parse; - if ( UTF ) { - STRLEN numlen; - while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, - RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT))) - { - RExC_parse += numlen; - } + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); } else { - while( isIDFIRST(*RExC_parse) ) - RExC_parse++; + return FALSE; } - if ( flags ) { - SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, - (int)(RExC_parse - name_start))); - if (UTF) - SvUTF8_on(sv_name); - if ( flags == REG_RSN_RETURN_NAME) - return sv_name; - else if (flags==REG_RSN_RETURN_DATA) { - HE *he_str = NULL; - SV *sv_dat = NULL; - if ( ! sv_name ) /* should not happen*/ - Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); - if (RExC_paren_names) - he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); - if ( he_str ) - sv_dat = HeVAL(he_str); - if ( ! sv_dat ) - vFAIL("Reference to nonexistent named group"); - return sv_dat; - } - else { - Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } } - /* NOT REACHED */ } return NULL; } -#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ - int rem=(int)(RExC_end - RExC_parse); \ - int cut; \ - int num; \ - int iscut=0; \ - if (rem>10) { \ - rem=10; \ +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_len(av); + SvREFCNT_dec(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + char *s = NULL; + I32 i = 0; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if (!rx->subbeg) { + sv_setsv(sv,&PL_sv_undef); + return; + } + else + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { + /* $` */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { + /* $' */ + s = rx->subbeg + rx->offs[0].end; + i = rx->sublen - rx->offs[0].end; + } + else + if ( 0 <= paren && paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + /* $& $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1; + } else { + sv_setsv(sv,&PL_sv_undef); + return; + } + assert(rx->sublen >= (s - rx->subbeg) + i ); + if (i >= 0) { + const int oldtainted = PL_tainted; + TAINT_NOT; + sv_setpvn(sv, s, i); + PL_tainted = oldtainted; + if ( (rx->extflags & RXf_CANY_SEEN) + ? (RXp_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RXp_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (PL_tainting) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + PL_tainted = 1; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + PL_tainted = 1; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak(aTHX_ "%s", PL_no_modify); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + /* Some of this code was originally in C in F */ + switch (paren) { + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + /* $& / ${^MATCH}, $1, $2, ... */ + default: + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} + +/* Scans the name of a named buffer from the pattern. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. + */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ + char *name_start = RExC_parse; + + PERL_ARGS_ASSERT_REG_SCAN_NAME; + + if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + /* skip IDFIRST by using do...while */ + if (UTF) + do { + RExC_parse += UTF8SKIP(RExC_parse); + } while (isALNUM_utf8((U8*)RExC_parse)); + else + do { + RExC_parse++; + } while (isALNUM(*RExC_parse)); + } + + if ( flags ) { + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); + } + /* NOT REACHED */ + } + return NULL; +} + +#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ + int rem=(int)(RExC_end - RExC_parse); \ + int cut; \ + int num; \ + int iscut=0; \ + if (rem>10) { \ + rem=10; \ iscut=1; \ } \ cut=10-rem; \ @@ -4710,7 +5400,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { PerlIO_printf(Perl_debug_log,"%16s",""); \ \ if (SIZE_ONLY) \ - num=RExC_size; \ + num = RExC_size + 1; \ else \ num=REG_NODE_NUM(RExC_emit); \ if (RExC_lastnum!=num) \ @@ -4751,10 +5441,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) #endif -/* this idea is borrowed from STR_WITH_LEN in handy.h */ -#define CHECK_WORD(s,v,l) \ - (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1)))) - STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ @@ -4766,9 +5452,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; /* for (?g), (?gc), and (?o) warnings; warning about (?c) will warn about (?g) -- japhy */ @@ -4783,8 +5471,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char * const oregcomp_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; - DEBUG_PARSE("reg "); + PERL_ARGS_ASSERT_REG; + DEBUG_PARSE("reg "); *flagp = 0; /* Tentatively. */ @@ -4822,39 +5511,39 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ - if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) { + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { op = ACCEPT; internal_argval = RExC_nestroot; } break; case 'C': /* (*COMMIT) */ - if ( CHECK_WORD("COMMIT",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"COMMIT") ) op = COMMIT; break; case 'F': /* (*FAIL) */ - if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) { + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { op = OPFAIL; argok = 0; } break; case ':': /* (*:NAME) */ case 'M': /* (*MARK:NAME) */ - if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) { + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { op = MARKPOINT; argok = -1; } break; case 'P': /* (*PRUNE) */ - if ( CHECK_WORD("PRUNE",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"PRUNE") ) op = PRUNE; break; case 'S': /* (*SKIP) */ - if ( CHECK_WORD("SKIP",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"SKIP") ) op = SKIP; break; case 'T': /* (*THEN) */ /* [19:06] :: is then */ - if ( CHECK_WORD("THEN",start_verb,verb_len) ) { + if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; RExC_seen |= REG_SEEN_CUTGROUP; } @@ -4897,8 +5586,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else if (*RExC_parse == '?') { /* (?...) */ - U32 posflags = 0, negflags = 0; - U32 *flagsp = &posflags; bool is_logical = 0; const char * const seqstart = RExC_parse; @@ -4907,10 +5594,48 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = NULL; /* For look-ahead/behind. */ switch (paren) { - case '<': /* (?<...) */ + case 'P': /* (?P...) variants for those used to PCRE/Python */ + paren = *RExC_parse++; + if ( paren == '<') /* (?P<...>) named capture */ + goto named_capture; + else if (paren == '>') { /* (?P>name) named recursion */ + goto named_recursion; + } + else if (paren == '=') { /* (?P=...) named backref */ + /* this pretty much dupes the code for \k in regatom(), if + you change this make sure you change that */ + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ')') + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, 1, "S" ); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ + + nextchar(pRExC_state); + return ret; + } + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; else if (*RExC_parse != '=') + named_capture: { /* (?<...>) */ char *name_start; SV *svname; @@ -4921,8 +5646,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? /* reverse test from the others */ REG_RSN_RETURN_NAME : REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) - goto unknown; + if (RExC_parse == name_start) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } if (*RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); @@ -4934,7 +5662,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "panic: reg_scan_name returned NULL"); if (!RExC_paren_names) { RExC_paren_names= newHV(); - sv_2mortal((SV*)RExC_paren_names); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); +#ifdef DEBUGGING + RExC_paren_name_list= newAV(); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); +#endif } he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); if ( he_str ) @@ -4944,17 +5676,36 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Perl_croak(aTHX_ "panic: paren_name hash element allocation failed"); } else if ( SvPOK(sv_dat) ) { - IV count=SvIV(sv_dat); - I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1); - SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32)); - pv[count]=RExC_npar; - SvIVX(sv_dat)++; + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); + } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); SvIOK_on(sv_dat); - SvIVX(sv_dat)= 1; + SvIV_set(sv_dat, 1); } +#ifdef DEBUGGING + if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + SvREFCNT_dec(svname); +#endif /*sv_dump(sv_dat);*/ } @@ -4965,6 +5716,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_SEEN_LOOKBEHIND; RExC_parse++; case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; case '!': /* (?!...) */ RExC_seen_zerolen++; if (*RExC_parse == ')') { @@ -4972,6 +5725,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; case ':': /* (?:...) */ case '>': /* (?>...) */ break; @@ -4992,14 +5752,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ { /* named and numeric backreferences */ I32 num; - char * parse_start; case '&': /* (?&NAME) */ parse_start = RExC_parse - 1; + named_recursion: { SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); @@ -5072,19 +5833,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ /* NOT REACHED */ - case 'p': /* (?p...) */ - if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) - vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); - /* FALL THROUGH*/ case '?': /* (??...) */ is_logical = 1; - if (*RExC_parse != '{') - goto unknown; + if (*RExC_parse != '{') { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -5189,7 +5951,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; @@ -5267,6 +6029,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ return ret; } else { @@ -5278,13 +6043,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("Sequence (? incomplete"); break; default: - --RExC_parse; - parse_flags: /* (?i) */ - while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { + --RExC_parse; + parse_flags: /* (?i) */ + { + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ /* (?g), (?gc) and (?o) are useless here and must be globally applied -- japhy */ - - if (*RExC_parse == 'o' || *RExC_parse == 'g') { + switch (*RExC_parse) { + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5299,8 +6071,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else if (*RExC_parse == 'c') { + break; + + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5312,33 +6085,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else { pmflag(flagsp, *RExC_parse); } - - ++RExC_parse; - } - if (*RExC_parse == '-') { - flagsp = &negflags; - wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + if (flagsp == &negflags) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + paren = ':'; + /*FALLTHROUGH*/ + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } + nextchar(pRExC_state); + if (paren != ':') { + *flagp = TRYAGAIN; + return NULL; + } else { + ret = NULL; + goto parse_rest; + } + /*NOTREACHED*/ + default: + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } ++RExC_parse; - goto parse_flags; - } - RExC_flags |= posflags; - RExC_flags &= ~negflags; - if (*RExC_parse == ':') { - RExC_parse++; - paren = ':'; - break; - } - unknown: - if (*RExC_parse != ')') { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); } - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; - } + }} /* one for the default block, one for the switch */ } else { /* (...) */ capturing_parens: @@ -5349,7 +6140,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_SEEN_RECURSE + && !RExC_open_parens[parno-1]) + { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); @@ -5363,10 +6156,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); + + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + /* branch_len = (paren != 0); */ if (br == NULL) @@ -5392,7 +6193,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if (paren != '?') /* Not Conditional */ ret = br; - *flagp |= flags & (SPSTART | HASWIDTH); + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -5402,15 +6203,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) return(NULL); REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); } if (have_branch || paren != ':') { @@ -5503,7 +6307,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } - + if (after_freeze) + RExC_npar = after_freeze; return(ret); } @@ -5521,7 +6326,11 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) register regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + DEBUG_PARSE("brnc"); + if (first) ret = NULL; else { @@ -5550,7 +6359,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } else if (ret == NULL) ret = latest; - *flagp |= flags&HASWIDTH; + *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { @@ -5595,6 +6404,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; const char *maxpos = NULL; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + DEBUG_PARSE("piec"); ret = regatom(pRExC_state, &flags,depth+1); @@ -5672,7 +6484,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (max && max < min) + if (max < min) vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, (U16)min); @@ -5732,11 +6544,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { - vWARN3(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + ckWARN3reg(RExC_parse, + "%.*s matches null string many times", + (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), + origparse); } if (RExC_parse < RExC_end && *RExC_parse == '?') { @@ -5771,267 +6583,242 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* reg_namedseq(pRExC_state,UVp) This is expected to be called by a parser routine that has - recognized'\N' and needs to handle the rest. RExC_parse is + recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. + + The \N may be inside (indicated by valuep not being NULL) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those. The net effect is that if the + beginning of the passed-in pattern isn't '{U+' or there is no '}', it + signals that this \N occurrence means to match a non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. If valuep is non-null then it is assumed that we are parsing inside of a charclass definition and the first codepoint in the resolved string is returned via *valuep and the routine will return NULL. In this mode if a multichar string is returned from the charnames - handler a warning will be issued, and only the first char in the + handler, a warning will be issued, and only the first char in the sequence will be examined. If the string returned is zero length then the value of *valuep is undefined and NON-NULL will be returned to indicate failure. (This will NOT be a valid pointer to a regnode.) - If value is null then it is assumed that we are parsing normal text - and inserts a new EXACT node into the program containing the resolved - string and returns a pointer to the new node. If the string is - zerolength a NOTHING node is emitted. - + If valuep is null then it is assumed that we are parsing normal text and a + new EXACT node is inserted into the program containing the resolved string, + and a pointer to the new node is returned. But if the string is zero length + a NOTHING node is emitted instead. + On success RExC_parse is set to the char following the endbrace. - Parsing failures will generate a fatal errorvia vFAIL(...) - - NOTE: We cache all results from the charnames handler locally in - the RExC_charnames hash (created on first use) to prevent a charnames - handler from playing silly-buggers and returning a short string and - then a long string for a given pattern. Since the regexp program - size is calculated during an initial parse this would result - in a buffer overrun so we cache to prevent the charname result from - changing during the course of the parse. - + Parsing failures will generate a fatal error via vFAIL(...) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) { - char * name; /* start of the content of the name */ char * endbrace; /* endbrace following the name */ - SV *sv_str = NULL; - SV *sv_name = NULL; - STRLEN len; /* this has various purposes throughout the code */ - bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ regnode *ret = NULL; +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; /* points to the '\N' */ +#endif + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMEDSEQ; + + GET_RE_DEBUG_FLAGS; + + /* Disambiguate between \N meaning a named character versus \N meaning + * don't match a newline. */ + if (*RExC_parse != '{' + || (! (endbrace = strchr(RExC_parse, '}'))) /* no trailing brace */ + || ! (endbrace == RExC_parse + 1 /* nothing between the {} */ + || (endbrace - RExC_parse > 3 /* U+ and at least one hex */ + && strnEQ(RExC_parse + 1, "U+", 2)))) + { + if (valuep) { + /* no bare \N in a charclass */ + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + nextchar(pRExC_state); + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + RExC_parse--; + Set_Node_Length(ret, 1); /* MJD */ + return ret; + } + + /* Here, we have decided it is a named sequence */ + RExC_parse++; /* Skip past the '{' */ + if (endbrace == RExC_parse) { /* empty: \N{} */ + if (! valuep) { + RExC_parse = endbrace + 1; + return reg_node(pRExC_state,NOTHING); + } + + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class" + ); + RExC_parse = endbrace + 1; + } + *valuep = 0; + return (regnode *) &RExC_parse; /* Invalid regnode pointer */ + } + + RExC_utf8 = 1; /* named sequences imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + if (valuep) { /* In a bracketed char class */ + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - if (*RExC_parse != '{') { - vFAIL("Missing braces on \\N{}"); - } - name = RExC_parse+1; - endbrace = strchr(RExC_parse, '}'); - if ( ! endbrace ) { - RExC_parse++; - vFAIL("Missing right brace on \\N{}"); - } - RExC_parse = endbrace + 1; - - - /* RExC_parse points at the beginning brace, - endbrace points at the last */ - if ( name[0]=='U' && name[1]=='+' ) { - /* its a "unicode hex" notation {U+89AB} */ - I32 fl = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - UV cp; - len = (STRLEN)(endbrace - name - 2); - cp = grok_hex(name + 2, &len, &fl, NULL); - if ( len != (STRLEN)(endbrace - name - 2) ) { - cp = 0xFFFD; - } - if (cp > 0xff) - RExC_utf8 = 1; - if ( valuep ) { - *valuep = cp; - return NULL; - } - sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); - } else { - /* fetch the charnames handler for this scope */ - HV * const table = GvHV(PL_hintgv); - SV **cvp= table ? - hv_fetchs(table, "charnames", FALSE) : - NULL; - SV *cv= cvp ? *cvp : NULL; - HE *he_str; - int count; - /* create an SV with the name as argument */ - sv_name = newSVpvn(name, endbrace - name); - - if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { - vFAIL2("Constant(\\N{%s}) unknown: " - "(possibly a missing \"use charnames ...\")", - SvPVX(sv_name)); - } - if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */ - vFAIL2("Constant(\\N{%s}): " - "$^H{charnames} is not defined",SvPVX(sv_name)); - } - - - - if (!RExC_charnames) { - /* make sure our cache is allocated */ - RExC_charnames = newHV(); - sv_2mortal((SV*)RExC_charnames); - } - /* see if we have looked this one up before */ - he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); - if ( he_str ) { - sv_str = HeVAL(he_str); - cached = 1; - } else { - dSP ; + char * endchar = strchr(RExC_parse, '.'); + if (endchar) { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + else endchar = endbrace; - ENTER ; - SAVETMPS ; - PUSHMARK(SP) ; - - XPUSHs(sv_name); - - PUTBACK ; - - count= call_sv(cv, G_SCALAR); - - if (count == 1) { /* XXXX is this right? dmq */ - sv_str = POPs; - SvREFCNT_inc_simple_void(sv_str); - } - - SPAGAIN ; - PUTBACK ; - FREETMPS ; - LEAVE ; - - if ( !sv_str || !SvOK(sv_str) ) { - vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} " - "did not return a defined value",SvPVX(sv_name)); - } - if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0)) - cached = 1; - } + length_of_hex = (STRLEN)(endchar - RExC_parse); + *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if ( length_of_hex != (STRLEN)(endchar - RExC_parse) ) { + *valuep = UNICODE_REPLACEMENT; + } + + RExC_parse = endbrace + 1; + if (endchar == endbrace) return NULL; + + ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ } - if (valuep) { - char *p = SvPV(sv_str, len); - if (len) { - STRLEN numlen = 1; - if ( SvUTF8(sv_str) ) { - *valuep = utf8_to_uvchr((U8*)p, &numlen); - if (*valuep > 0x7F) - RExC_utf8 = 1; - /* XXXX - We have to turn on utf8 for high bit chars otherwise - we get failures with - - "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i - "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i - - This is different from what \x{} would do with the same - codepoint, where the condition is > 0xFF. - - dmq - */ - - - } else { - *valuep = (UV)*p; - /* warn if we havent used the whole string? */ - } - if (numlen 0) { - const STRLEN unilen = reguni(pRExC_state, uvc, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } else { - const STRLEN unilen = reguni(pRExC_state, uvc, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - } else { - len++; - REGC(*p, s++); - } - } - if (SIZE_ONLY) { - RExC_size += STR_SZ(len); - } else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } - Set_Node_Cur_Length(ret); /* MJD */ - RExC_parse--; - nextchar(pRExC_state); - } else { - ret = reg_node(pRExC_state,NOTHING); - } - if (!cached) { - SvREFCNT_dec(sv_str); - } - if (sv_name) { - SvREFCNT_dec(sv_name); + else { /* Not a char class */ + char *s; /* String to put in generated EXACT node */ + STRLEN len = 0; /* Its current length */ + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + + ret = reg_node(pRExC_state, + (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); + s= STRING(ret); + + /* Exact nodes can hold only a U8 length's of text = 255. Loop through + * the input which is of the form now 'c1.c2.c3...}' until find the + * ending brace or exeed length 255. The characters that exceed this + * limit are dropped. The limit could be relaxed should it become + * desirable by reparsing this as (?:\N{NAME}), so could generate + * multiple EXACT nodes, as is done for just regular input. But this + * is primarily a named character, and not intended to be a huge long + * string, so 255 bytes should be good enough */ + while (1) { + STRLEN this_char_length; + I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + UV cp; /* Ord of current character */ + + /* Code points are separated by dots. If none, there is only one + * code point, and is terminated by the brace */ + endchar = strchr(RExC_parse, '.'); + if (! endchar) endchar = endbrace; + + /* The values are Unicode even on EBCDIC machines */ + this_char_length = (STRLEN)(endchar - RExC_parse); + cp = grok_hex(RExC_parse, &this_char_length, &grok_flags, NULL); + if ( this_char_length == 0 + || this_char_length != (STRLEN)(endchar - RExC_parse) ) + { + cp = UNICODE_REPLACEMENT; /* Substitute a valid character */ + } + + if (! FOLD) { /* Not folding, just append to the string */ + STRLEN unilen; + + /* Quit before adding this character if would exceed limit */ + if (len + UNISKIP(cp) > U8_MAX) break; + + unilen = reguni(pRExC_state, cp, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } else { /* Folding, output the folded equivalent */ + STRLEN foldlen,numlen; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; + cp = toFOLD_uni(cp, tmpbuf, &foldlen); + + /* Quit before exceeding size limit */ + if (len + foldlen > U8_MAX) break; + + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) + { + cp = utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + const STRLEN unilen = reguni(pRExC_state, cp, s); + s += unilen; + len += unilen; + /* In EBCDIC the numlen and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ + } + } + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + + /* Quit if no more characters */ + if (RExC_parse >= endbrace) break; + } + + + if (SIZE_ONLY) { + if (RExC_parse < endbrace) { + ckWARNreg(RExC_parse - 1, + "Using just the first characters returned by \\N{}"); + } + + RExC_size += STR_SZ(len); + } else { + STR_LEN(ret) = len; + RExC_emit += STR_SZ(len); + } + + RExC_parse = endbrace + 1; + + *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail + with malformed in t/re/pat_advanced.t */ + RExC_parse --; + Set_Node_Cur_Length(ret); /* MJD */ + nextchar(pRExC_state); } - return ret; + return ret; } @@ -6049,12 +6836,13 @@ STATIC UV S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; - SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); - const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) - : SvPVX(sv); + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; + PERL_ARGS_ASSERT_REG_RECODE; + if (newlen) uv = SvUTF8(sv) ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) @@ -6062,8 +6850,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) if (!newlen || numlen != newlen) { uv = UNICODE_REPLACEMENT; - if (encp) - *encp = NULL; + *encp = NULL; } return uv; } @@ -6071,15 +6858,26 @@ S_reg_recode(pTHX_ const char value, SV **encp) /* - regatom - the lowest level - * - * Optimization: gobbles an entire sequence of ordinary characters so that - * it can turn them into a single node, which is smaller to store and - * faster to run. Backslashed characters are exceptions, each becoming a - * separate node; the code is simpler that way and it's not worth fixing. - * - * [Yes, it is worth fixing, some scripts can run twice the speed.] - * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes] - */ + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends can either, depending + on context. Specifically there are two seperate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. +*/ + STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { @@ -6091,8 +6889,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DEBUG_PARSE("atom"); *flagp = WORST; /* Tentatively. */ + PERL_ARGS_ASSERT_REGATOM; + tryagain: - switch (*RExC_parse) { + switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; nextchar(pRExC_state); @@ -6153,7 +6953,7 @@ tryagain: } return(NULL); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; case '|': case ')': @@ -6176,105 +6976,147 @@ tryagain: RExC_parse++; vFAIL("Quantifier follows nothing"); break; + case 0xDF: + case 0xC3: + case 0xCE: + do_foldchar: + if (!LOC && FOLD) { + U32 len,cp; + len=0; /* silence a spurious compiler warning */ + if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { + *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ + RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ + ret = reganode(pRExC_state, FOLDCHAR, cp); + Set_Node_Length(ret, 1); /* MJD */ + nextchar(pRExC_state); /* kill whitespace under /x */ + return ret; + } + } + goto outer_default; case '\\': - switch (*++RExC_parse) { + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ + switch ((U8)*++RExC_parse) { + case 0xDF: + case 0xC3: + case 0xCE: + goto do_foldchar; + /* Special Escapes */ case 'A': RExC_seen_zerolen++; ret = reg_node(pRExC_state, SBOL); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); RExC_seen |= REG_SEEN_GPOS; *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_SEEN_LOOKBEHIND; + goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ - nextchar(pRExC_state); - break; + goto finish_meta_pat; case 'z': ret = reg_node(pRExC_state, EOS); *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); RExC_seen |= REG_SEEN_CANY; *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'w': ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'W': ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 's': ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'S': ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'd': ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'D': ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'v': + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'V': + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ - break; + break; case 'p': case 'P': { char* const oldregxend = RExC_end; +#ifdef DEBUGGING char* parse_start = RExC_parse - 2; +#endif if (RExC_parse[1] == '{') { /* a lovely hack--pretend we saw [\pX] instead */ @@ -6306,85 +7148,95 @@ tryagain: } break; case 'N': - /* Handle \N{NAME} here and not below because it can be + /* Handle \N and \N{NAME} here and not below because it can be multicharacter. join_exact() will join them up later on. Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL); + ret= reg_namedseq(pRExC_state, NULL, flagp); break; case 'k': /* Handle \k and \k'NAME' */ + parse_named_seq: { char ch= RExC_parse[1]; - if (ch != '<' && ch != '\'') { - if (SIZE_ONLY) - vWARN( RExC_parse + 1, - "Possible broken named back reference treated as literal k"); - parse_start--; - goto defchar; + if (ch != '<' && ch != '\'' && ch != '{') { + RExC_parse++; + vFAIL2("Sequence %.2s... not terminated",parse_start); } else { + /* this pretty much dupes the code for (?P=...) in reg(), if + you change this make sure you change that */ char* name_start = (RExC_parse += 2); U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - ch= (ch == '<') ? '>' : '\''; - + ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) - vFAIL2("Sequence \\k%c... not terminated", - (ch == '>' ? '<' : ch)); - + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, 1, "S" ); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; ret = reganode(pRExC_state, (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), num); *flagp |= HASWIDTH; - - - if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); - ARG_SET(ret,num); - RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); - } + /* override incorrect value set in reganode MJD */ Set_Node_Offset(ret, parse_start+1); Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); - + } break; - } - case 'n': - case 'r': - case 't': - case 'f': - case 'e': - case 'a': - case 'x': - case 'c': - case '0': - goto defchar; - case 'R': + } + case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isrel=(*RExC_parse=='R'); - if (isrel) + bool isg = *RExC_parse == 'g'; + bool isrel = 0; + bool hasbrace = 0; + if (isg) { RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + if (hasbrace && !isDIGIT(*RExC_parse)) { + if (isrel) RExC_parse--; + RExC_parse -= 2; + goto parse_named_seq; + } } num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); if (isrel) { num = RExC_npar - num; if (num < 1) vFAIL("Reference to nonexistent or unclosed group"); } - if (num > 9 && num >= RExC_npar) + if (!isg && num > 9 && num >= RExC_npar) goto defchar; else { char * const parse_start = RExC_parse - 1; /* MJD */ while (isDIGIT(*RExC_parse)) RExC_parse++; - + if (parse_start == RExC_parse - 1) + vFAIL("Unterminated \\g... pattern"); + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) vFAIL("Reference to nonexistent group"); @@ -6417,14 +7269,13 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - while (RExC_parse < RExC_end && *RExC_parse != '\n') - RExC_parse++; - if (RExC_parse < RExC_end) + if ( reg_skipcomment( pRExC_state ) ) goto tryagain; } /* FALL THROUGH */ - default: { + default: + outer_default:{ register STRLEN len; register UV ender; register char *p; @@ -6448,8 +7299,13 @@ tryagain: char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); - switch (*p) { + p = regwhite( pRExC_state, p ); + switch ((U8)*p) { + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case '^': case '$': case '.': @@ -6459,28 +7315,47 @@ tryagain: case '|': goto loopdone; case '\\': - switch (*++p) { - case 'A': - case 'C': - case 'X': - case 'G': - case 'Z': - case 'z': - case 'w': - case 'W': - case 'b': - case 'B': - case 's': - case 'S': - case 'd': - case 'D': - case 'p': - case 'P': - case 'N': - case 'R': - case 'k': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + + switch ((U8)*++p) { + /* These are all the special escapes. */ + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'N': /* named char sequence */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ case 'n': ender = '\n'; p++; @@ -6544,6 +7419,18 @@ tryagain: I32 flags = 0; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); + + /* An octal above 0xff is interpreted differently + * depending on if the re is in utf8 or not. If it + * is in utf8, the value will be itself, otherwise + * it is interpreted as modulo 0x100. It has been + * decided to discourage the use of octal above the + * single-byte range. For now, warn only when + * it ends up modulo */ + if (SIZE_ONLY && ender >= 0x100 + && ! UTF && ! PL_encoding) { + ckWARNregdep(p, "Use of octal value above 377 is deprecated"); + } p += numlen; } else { @@ -6557,8 +7444,8 @@ tryagain: { SV* enc = PL_encoding; ender = reg_recode((const char)(U8)ender, &enc); - if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(p, "Invalid escape in the specified encoding"); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); RExC_utf8 = 1; } break; @@ -6567,8 +7454,8 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP)) - vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); + if (!SIZE_ONLY&& isALPHA(*p)) + ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; @@ -6584,13 +7471,13 @@ tryagain: ender = *p++; break; } - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); if (UTF && FOLD) { /* Prime the casefolded buffer. */ ender = toFOLD_uni(ender, tmpbuf, &foldlen); } - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (UTF) { @@ -6692,15 +7579,25 @@ tryagain: } STATIC char * -S_regwhite(char *p, const char *e) +S_regwhite( RExC_state_t *pRExC_state, char *p ) { + const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGWHITE; + while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { + bool ended = 0; do { - p++; - } while (p < e && *p != '\n'); + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; } else break; @@ -6724,6 +7621,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) dVAR; I32 namedclass = OOB_NAMEDCLASS; + PERL_ARGS_ASSERT_REGPPOSIXCC; + if (value == '[' && RExC_parse + 1 < RExC_end && /* I smell either [: or [= or [. -- POSIX has been here, right? */ POSIXCC(UCHARAT(RExC_parse))) { @@ -6838,6 +7737,9 @@ STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { dVAR; + + PERL_ARGS_ASSERT_CHECKPOSIXCC; + if (POSIXCC(UCHARAT(RExC_parse))) { const char *s = RExC_parse; const char c = *s++; @@ -6845,10 +7747,9 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) while (isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - if (ckWARN(WARN_REGEXP)) - vWARN3(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); /* [[=foo=]] and [[.foo.]] are still future. */ if (POSIXCC_NOTYET(c)) { @@ -6863,19 +7764,77 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } +#define _C_C_T_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + if (LOC) \ + ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ + else { \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + } \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + if (LOC) \ + ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ + else { \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + } \ + yesno = '!'; \ + what = WORD; \ + break + +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '!'; \ + what = WORD; \ + break + +/* + We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test + so that it is possible to override the option here without having to + rebuild the entire core. as we are required to do if we change regcomp.h + which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. +*/ +#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS +#define BROKEN_UNICODE_CHARCLASS_MAPPINGS +#endif + +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS +#define POSIX_CC_UNI_NAME(CCNAME) CCNAME +#else +#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME +#endif + /* parse a class specification and produce either an ANYOF node that - matches the pattern. If the pattern matches a single char only and - that char is < 256 then we produce an EXACT node instead. + matches the pattern or if the pattern matches a single char only and + that char is < 256 and we are case insensitive then we produce an + EXACT node instead. */ + STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; - register UV value = 0; register UV nextvalue; register IV prevvalue = OOB_UNICODE; register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; IV namedclass; @@ -6894,6 +7853,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -6978,6 +7939,10 @@ parseit: case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { /* We only pay attention to the first char of @@ -6986,7 +7951,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v)) { + if (reg_namedseq(pRExC_state, &v, NULL)) { goto parseit; } value= v; @@ -7080,16 +8045,16 @@ parseit: { SV* enc = PL_encoding; value = reg_recode((const char)(U8)value, &enc); - if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse, - "Invalid escape in the specified encoding"); + if (!enc && SIZE_ONLY) + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); break; } default: - if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) - vWARN2(RExC_parse, - "Unrecognized escape \\%c in character class passed through", - (int)value); + if (!SIZE_ONLY && isALPHA(value)) + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } } /* end of \blah */ @@ -7108,383 +8073,119 @@ parseit: /* a bad range like a-\d, a-[:digit:] ? */ if (range) { if (!SIZE_ONLY) { - if (ckWARN(WARN_REGEXP)) { - const int w = - RExC_parse >= rangebegin ? - RExC_parse - rangebegin : 0; - vWARN4(RExC_parse, + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); - } + if (prevvalue < 256) { ANYOF_BITMAP_SET(ret, prevvalue); ANYOF_BITMAP_SET(ret, '-'); } else { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); - } - } - - range = 0; /* this was not a true range */ - } - - if (!SIZE_ONLY) { - const char *what = NULL; - char yesno = 0; - - if (namedclass > OOB_NAMEDCLASS) - optimize_invert = FALSE; - /* Possible truncation here but in some 64-bit environments - * the compiler gets heartburn about switch on 64-bit values. - * A similar issue a little earlier when switching on value. - * --jhi */ - switch ((I32)namedclass) { - case ANYOF_ALNUM: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALNUM); - else { - for (value = 0; value < 256; value++) - if (isALNUM(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Word"; - break; - case ANYOF_NALNUM: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NALNUM); - else { - for (value = 0; value < 256; value++) - if (!isALNUM(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Word"; - break; - case ANYOF_ALNUMC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); - else { - for (value = 0; value < 256; value++) - if (isALNUMC(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Alnum"; - break; - case ANYOF_NALNUMC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NALNUMC); - else { - for (value = 0; value < 256; value++) - if (!isALNUMC(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Alnum"; - break; - case ANYOF_ALPHA: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALPHA); - else { - for (value = 0; value < 256; value++) - if (isALPHA(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Alpha"; - break; - case ANYOF_NALPHA: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NALPHA); - else { - for (value = 0; value < 256; value++) - if (!isALPHA(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Alpha"; - break; - case ANYOF_ASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ASCII); - else { -#ifndef EBCDIC - for (value = 0; value < 128; value++) - ANYOF_BITMAP_SET(ret, value); -#else /* EBCDIC */ - for (value = 0; value < 256; value++) { - if (isASCII(value)) - ANYOF_BITMAP_SET(ret, value); - } -#endif /* EBCDIC */ - } - yesno = '+'; - what = "ASCII"; - break; - case ANYOF_NASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NASCII); - else { -#ifndef EBCDIC - for (value = 128; value < 256; value++) - ANYOF_BITMAP_SET(ret, value); -#else /* EBCDIC */ - for (value = 0; value < 256; value++) { - if (!isASCII(value)) - ANYOF_BITMAP_SET(ret, value); - } -#endif /* EBCDIC */ - } - yesno = '!'; - what = "ASCII"; - break; - case ANYOF_BLANK: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_BLANK); - else { - for (value = 0; value < 256; value++) - if (isBLANK(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Blank"; - break; - case ANYOF_NBLANK: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NBLANK); - else { - for (value = 0; value < 256; value++) - if (!isBLANK(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Blank"; - break; - case ANYOF_CNTRL: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_CNTRL); - else { - for (value = 0; value < 256; value++) - if (isCNTRL(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Cntrl"; - break; - case ANYOF_NCNTRL: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NCNTRL); - else { - for (value = 0; value < 256; value++) - if (!isCNTRL(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Cntrl"; - break; - case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); - else { - /* consecutive digits assumed */ - for (value = '0'; value <= '9'; value++) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Digit"; - break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); - else { - /* consecutive digits assumed */ - for (value = 0; value < '0'; value++) - ANYOF_BITMAP_SET(ret, value); - for (value = '9' + 1; value < 256; value++) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Digit"; - break; - case ANYOF_GRAPH: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_GRAPH); - else { - for (value = 0; value < 256; value++) - if (isGRAPH(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Graph"; - break; - case ANYOF_NGRAPH: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NGRAPH); - else { - for (value = 0; value < 256; value++) - if (!isGRAPH(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Graph"; - break; - case ANYOF_LOWER: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_LOWER); - else { - for (value = 0; value < 256; value++) - if (isLOWER(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Lower"; - break; - case ANYOF_NLOWER: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NLOWER); - else { - for (value = 0; value < 256; value++) - if (!isLOWER(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Lower"; - break; - case ANYOF_PRINT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_PRINT); - else { - for (value = 0; value < 256; value++) - if (isPRINT(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Print"; - break; - case ANYOF_NPRINT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NPRINT); - else { - for (value = 0; value < 256; value++) - if (!isPRINT(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Print"; - break; - case ANYOF_PSXSPC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); - else { - for (value = 0; value < 256; value++) - if (isPSXSPC(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Space"; - break; - case ANYOF_NPSXSPC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); - else { - for (value = 0; value < 256; value++) - if (!isPSXSPC(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Space"; - break; - case ANYOF_PUNCT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_PUNCT); - else { - for (value = 0; value < 256; value++) - if (isPUNCT(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "Punct"; - break; - case ANYOF_NPUNCT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NPUNCT); - else { - for (value = 0; value < 256; value++) - if (!isPUNCT(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '!'; - what = "Punct"; - break; - case ANYOF_SPACE: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(ret, value); - } - yesno = '+'; - what = "SpacePerl"; - break; - case ANYOF_NSPACE: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NSPACE); - else { - for (value = 0; value < 256; value++) - if (!isSPACE(value)) - ANYOF_BITMAP_SET(ret, value); + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + Perl_sv_catpvf(aTHX_ listsv, + "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); } - yesno = '!'; - what = "SpacePerl"; - break; - case ANYOF_UPPER: + } + + range = 0; /* this was not a true range */ + } + + + + if (!SIZE_ONLY) { + const char *what = NULL; + char yesno = 0; + + if (namedclass > OOB_NAMEDCLASS) + optimize_invert = FALSE; + /* Possible truncation here but in some 64-bit environments + * the compiler gets heartburn about switch on 64-bit values. + * A similar issue a little earlier when switching on value. + * --jhi */ + switch ((I32)namedclass) { + + case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); + case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); + case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); + case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); + case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); + case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); + case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); + case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); + case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); + case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS + case _C_C_T_(ALNUM, isALNUM(value), "Word"); + case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); +#else + case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); +#endif + case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); + case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); + case ANYOF_ASCII: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_UPPER); + ANYOF_CLASS_SET(ret, ANYOF_ASCII); else { - for (value = 0; value < 256; value++) - if (isUPPER(value)) - ANYOF_BITMAP_SET(ret, value); +#ifndef EBCDIC + for (value = 0; value < 128; value++) + ANYOF_BITMAP_SET(ret, value); +#else /* EBCDIC */ + for (value = 0; value < 256; value++) { + if (isASCII(value)) + ANYOF_BITMAP_SET(ret, value); + } +#endif /* EBCDIC */ } yesno = '+'; - what = "Upper"; + what = "ASCII"; break; - case ANYOF_NUPPER: + case ANYOF_NASCII: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NUPPER); + ANYOF_CLASS_SET(ret, ANYOF_NASCII); else { - for (value = 0; value < 256; value++) - if (!isUPPER(value)) - ANYOF_BITMAP_SET(ret, value); +#ifndef EBCDIC + for (value = 128; value < 256; value++) + ANYOF_BITMAP_SET(ret, value); +#else /* EBCDIC */ + for (value = 0; value < 256; value++) { + if (!isASCII(value)) + ANYOF_BITMAP_SET(ret, value); + } +#endif /* EBCDIC */ } yesno = '!'; - what = "Upper"; - break; - case ANYOF_XDIGIT: + what = "ASCII"; + break; + case ANYOF_DIGIT: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_XDIGIT); + ANYOF_CLASS_SET(ret, ANYOF_DIGIT); else { - for (value = 0; value < 256; value++) - if (isXDIGIT(value)) - ANYOF_BITMAP_SET(ret, value); + /* consecutive digits assumed */ + for (value = '0'; value <= '9'; value++) + ANYOF_BITMAP_SET(ret, value); } yesno = '+'; - what = "XDigit"; + what = POSIX_CC_UNI_NAME("Digit"); break; - case ANYOF_NXDIGIT: + case ANYOF_NDIGIT: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT); + ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); else { - for (value = 0; value < 256; value++) - if (!isXDIGIT(value)) - ANYOF_BITMAP_SET(ret, value); + /* consecutive digits assumed */ + for (value = 0; value < '0'; value++) + ANYOF_BITMAP_SET(ret, value); + for (value = '9' + 1; value < 256; value++) + ANYOF_BITMAP_SET(ret, value); } yesno = '!'; - what = "XDigit"; - break; + what = POSIX_CC_UNI_NAME("Digit"); + break; case ANYOF_MAX: /* this is to handle \p and \P */ break; @@ -7548,12 +8249,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -7618,8 +8323,8 @@ parseit: if (!unicode_alternate) unicode_alternate = newAV(); - sv = newSVpvn((char*)foldbuf, foldlen); - SvUTF8_on(sv); + sv = newSVpvn_utf8((char*)foldbuf, foldlen, + TRUE); av_push(unicode_alternate, sv); } } @@ -7665,7 +8370,7 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ - if( stored == 1 && value < 256 + if( stored == 1 && (value < 128 || (value < 256 && !UTF)) && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) ) { /* optimize single char class to an EXACT node @@ -7679,6 +8384,7 @@ parseit: *STRING(ret)= (char)value; STR_LEN(ret)= 1; RExC_emit += STR_SZ(1); + SvREFCNT_dec(listsv); return ret; } /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ @@ -7715,20 +8421,70 @@ parseit: * used later (regexec.c:S_reginclass()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); - av_store(av, 2, (SV*)unicode_alternate); - rv = newRV_noinc((SV*)av); + av_store(av, 2, MUTABLE_SV(unicode_alternate)); + rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(ret, n); } return ret; } +#undef _C_C_T_ + + +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. + + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advance that parse position, and optionall absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { char* const retval = RExC_parse++; + PERL_ARGS_ASSERT_NEXTCHAR; + for (;;) { if (*RExC_parse == '(' && RExC_parse[1] == '?' && RExC_parse[2] == '#') { @@ -7746,9 +8502,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } else if (*RExC_parse == '#') { - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') break; - continue; + if ( reg_skipcomment( pRExC_state ) ) + continue; } } return retval; @@ -7766,23 +8521,24 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REG_NODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 1; return(ret); } -#ifdef DEBUGGING - if (OP(RExC_emit) == 255) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ", - reg_name[op], OP(RExC_emit)); -#endif + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), @@ -7790,7 +8546,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } - +#endif RExC_emit = ptr; return(ret); } @@ -7806,6 +8562,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGANODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; @@ -7825,18 +8583,18 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) */ return(ret); } -#ifdef DEBUGGING - if (OP(RExC_emit) == 255) - Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space"); -#endif + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), @@ -7844,7 +8602,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } - +#endif RExC_emit = ptr; return(ret); } @@ -7856,6 +8614,9 @@ STATIC STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; + + PERL_ARGS_ASSERT_REGUNI; + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); } @@ -7874,8 +8635,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ - DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]); + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); if (SIZE_ONLY) { RExC_size += size; return; @@ -7886,30 +8650,31 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) dst = RExC_emit; if (RExC_open_parens) { int paren; - DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar); + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ for ( paren=0 ; paren < RExC_npar ; paren++ ) { if ( RExC_open_parens[paren] >= opnd ) { - DEBUG_PARSE_FMT("open"," - %d",size); + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ RExC_open_parens[paren] += size; } else { - DEBUG_PARSE_FMT("open"," - %s","ok"); + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ } if ( RExC_close_parens[paren] >= opnd ) { - DEBUG_PARSE_FMT("close"," - %d",size); + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ RExC_close_parens[paren] += size; } else { - DEBUG_PARSE_FMT("close"," - %s","ok"); + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ } } } while (src > opnd) { StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(dst - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(src - RExC_emit_start), @@ -7918,15 +8683,17 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } +#endif } place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(place - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), @@ -7935,6 +8702,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -7951,6 +8719,8 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de dVAR; register regnode *scan; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -7969,7 +8739,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), - (temp == NULL ? reg_name[OP(val)] : "") + (temp == NULL ? PL_reg_name[OP(val)] : "") ); }); if (temp == NULL) @@ -8011,9 +8781,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; #endif - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGTAIL_STUDY; + if (SIZE_ONLY) return exact; @@ -8050,7 +8821,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), - reg_name[exact]); + PL_reg_name[exact]); }); if (temp == NULL) break; @@ -8080,9 +8851,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, /* - regcurly - a little FSA that accepts {\d+,?\d*} */ -STATIC I32 -S_regcurly(register const char *s) +I32 +Perl_regcurly(register const char *s) { + PERL_ARGS_ASSERT_REGCURLY; + if (*s++ != '{') return FALSE; if (!isDIGIT(*s)) @@ -8102,6 +8875,29 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +static void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8183,7 +8982,9 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else + PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); #endif /* DEBUGGING */ @@ -8192,6 +8993,17 @@ Perl_regdump(pTHX_ const regexp *r) /* - regprop - printable representation of opcode */ +#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ +STMT_START { \ + if (do_sep) { \ + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ + if (flags & ANYOF_INVERT) \ + /*make sure the invert info is in each */ \ + sv_catpvs(sv, "^"); \ + do_sep = 0; \ + } \ +} STMT_END + void Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { @@ -8201,43 +9013,42 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGPROP; - sv_setpvn(sv, "", 0); + sv_setpvs(sv, ""); if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); - sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */ + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; if (k == EXACT) { - SV * const dsv = sv_2mortal(newSVpvs("")); + sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since * we have no flag "this EXACTish node was UTF-8" * --jhi */ - const char * const s = - pv_pretty(dsv, STRING(o), STR_LEN(o), 60, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | - PERL_PV_PRETTY_LTGT - ); - Perl_sv_catpvf(aTHX_ sv, " %s", s ); + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); } else if (k == TRIE) { /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ const char op = OP(o); - const I32 n = ARG(o); + const U32 n = ARG(o); const reg_ac_data * const ac = IS_TRIE_AC(op) ? (reg_ac_data *)progi->data->data[n] : NULL; const reg_trie_data * const trie = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]); + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( Perl_sv_catpvf(aTHX_ sv, "", @@ -8254,7 +9065,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int i; int rangestart = -1; U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); - Perl_sv_catpvf(aTHX_ sv, "["); + sv_catpvs(sv, "["); for (i = 0; i <= 256; i++) { if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) @@ -8271,7 +9082,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) rangestart = -1; } } - Perl_sv_catpvf(aTHX_ sv, "]"); + sv_catpvs(sv, "]"); } } else if (k == CURLY) { @@ -8281,19 +9092,44 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) + else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ - else if (k == GOSUB) + if ( RXp_PAREN_NAMES(prog) ) { + if ( k != REF || OP(o) < NREF) { + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; nflags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - (SV*)progi->data->data[ ARG( o ) ]); + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == FOLDCHAR) + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ static const char * const anyofs[] = { @@ -8309,8 +9145,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^alpha:]", "[:ascii:]", "[:^ascii:]", - "[:ctrl:]", - "[:^ctrl:]", + "[:cntrl:]", + "[:^cntrl:]", "[:graph:]", "[:^graph:]", "[:lower:]", @@ -8336,6 +9172,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { if (rangestart == -1) @@ -8349,15 +9187,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "-"); put_byte(sv, i - 1); } + do_sep = 1; rangestart = -1; } } - + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + /* output any special charclass tests (used mostly under use locale) */ if (o->flags & ANYOF_CLASS) for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) + if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); - + do_sep = 1; + } + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + + /* output information about the unicode matching */ if (flags & ANYOF_UNICODE) sv_catpvs(sv, "{unicode}"); else if (flags & ANYOF_UNICODE_ALL) @@ -8370,7 +9216,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (lv) { if (sw) { U8 s[UTF8_MAXBYTES_CASE+1]; - + for (i = 0; i <= 256; i++) { /* just the first 256 */ uvchr_to_utf8(s, i); @@ -8441,10 +9287,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ regexp *prog) +Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; + struct regexp *const prog = (struct regexp *)SvANY(r); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; PERL_UNUSED_CONTEXT; DEBUG_COMPILE_r( @@ -8480,41 +9329,103 @@ Perl_re_intuit_string(pTHX_ regexp *prog) */ #ifndef PERL_IN_XSUB_RE void -Perl_pregfree(pTHX_ struct regexp *r) +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); GET_RE_DEBUG_FLAGS_DECL; - if (!r || (--r->refcnt > 0)) - return; - - CALLREGFREE_PVT(r); /* free the private data */ - - /* gcov results gave these as non-null 100% of the time, so there's no - optimisation in checking them before calling Safefree */ - Safefree(r->precomp); - RX_MATCH_COPY_FREE(r); + PERL_ARGS_ASSERT_PREGFREE2; + + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); + } + if (r->substrs) { + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } + RX_MATCH_COPY_FREE(rx); #ifdef PERL_OLD_COPY_ON_WRITE - if (r->saved_copy) - SvREFCNT_dec(r->saved_copy); + SvREFCNT_dec(r->saved_copy); #endif + Safefree(r->offs); +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesnt actually own any of its data except for + the starp/end and the actual regexp structure itself. + +*/ + + +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = (struct regexp *)SvANY(rx); + register const I32 npar = r->nparens+1; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + ret = (struct regexp *)SvANY(ret_x); + + (void)ReREFCNT_inc(rx); + /* We can take advantage of the existing "copied buffer" mechanism in SVs + by pointing directly at the buffer, but flagging that the allocated + space in the copy is zero. As we've just done a struct copy, it's now + a case of zero-ing that, rather than copying the current length. */ + SvPV_set(ret_x, RX_WRAPPED(rx)); + SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + SvLEN_set(ret_x, 0); + SvSTASH_set(ret_x, NULL); + SvMAGIC_set(ret_x, NULL); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); if (r->substrs) { - if (r->anchored_substr) - SvREFCNT_dec(r->anchored_substr); - if (r->anchored_utf8) - SvREFCNT_dec(r->anchored_utf8); - if (r->float_substr) - SvREFCNT_dec(r->float_substr); - if (r->float_utf8) - SvREFCNT_dec(r->float_utf8); - Safefree(r->substrs); + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ } - if (r->paren_names) - SvREFCNT_dec(r->paren_names); + RX_MATCH_COPIED_off(ret_x); +#ifdef PERL_OLD_COPY_ON_WRITE + ret->saved_copy = NULL; +#endif + ret->mother_re = rx; - Safefree(r->startp); - Safefree(r->endp); - Safefree(r); + return ret_x; } #endif @@ -8531,25 +9442,30 @@ Perl_pregfree(pTHX_ struct regexp *r) */ void -Perl_regfree_internal(pTHX_ struct regexp *r) +Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + DEBUG_COMPILE_r({ if (!PL_colorset) reginitcolors(); { SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, r->precomp, r->prelen, 60); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } }); - - Safefree(ri->offsets); /* 20010421 MJD */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif if (ri->data) { int n = ri->data->count; PAD* new_comppad = NULL; @@ -8562,13 +9478,13 @@ Perl_regfree_internal(pTHX_ struct regexp *r) case 's': case 'S': case 'u': - SvREFCNT_dec((SV*)ri->data->data[n]); + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); break; case 'f': Safefree(ri->data->data[n]); break; case 'p': - new_comppad = (AV*)ri->data->data[n]; + new_comppad = MUTABLE_AV(ri->data->data[n]); break; case 'o': if (new_comppad == NULL) @@ -8584,7 +9500,7 @@ Perl_regfree_internal(pTHX_ struct regexp *r) op_free((OP_4tree*)ri->data->data[n]); PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec((SV*)new_comppad); + SvREFCNT_dec(MUTABLE_SV(new_comppad)); new_comppad = NULL; break; case 'n': @@ -8638,26 +9554,21 @@ Perl_regfree_internal(pTHX_ struct regexp *r) Safefree(ri->data->what); Safefree(ri->data); } - if (ri->swap) { - Safefree(ri->swap->startp); - Safefree(ri->swap->endp); - Safefree(ri->swap); - } + Safefree(ri); } #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) -#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t)) -#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t)) +#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t))) +#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t))) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* - regdupe - duplicate a regexp. - - This routine is called by sv.c's re_dup and is expected to clone a - given regexp structure. It is a no-op when not under USE_ITHREADS. - (Originally this *was* re_dup() for change history see sv.c) + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + After all of the core data stored in struct regexp is duplicated the regexp_engine.dupe method is used to copy any private data stored in the *pprivate pointer. This allows extensions to handle @@ -8667,68 +9578,90 @@ Perl_regfree_internal(pTHX_ struct regexp *r) */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE -regexp * -Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; - regexp *ret; - int i, npar; - struct reg_substr_datum *s; - - if (!r) - return (REGEXP *)NULL; - - if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) - return ret; - + I32 npar; + const struct regexp *r = (const struct regexp *)SvANY(sstr); + struct regexp *ret = (struct regexp *)SvANY(dstr); + PERL_ARGS_ASSERT_RE_DUP_GUTS; + npar = r->nparens+1; - Newxz(ret, 1, regexp); - Newx(ret->startp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - Newx(ret->endp, npar, I32); - Copy(r->endp, ret->endp, npar, I32); - - Newx(ret->substrs, 1, struct reg_substr_data); - for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { - s->min_offset = r->substrs->data[i].min_offset; - s->max_offset = r->substrs->data[i].max_offset; - s->end_shift = r->substrs->data[i].end_shift; - s->substr = sv_dup_inc(r->substrs->data[i].substr, param); - s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if(ret->swap) { + /* no need to copy these */ + Newx(ret->swap, npar, regexp_paren_pair); } - - ret->precomp = SAVEPVN(r->precomp, r->prelen); - ret->refcnt = r->refcnt; - ret->minlen = r->minlen; - ret->minlenret = r->minlenret; - ret->prelen = r->prelen; - ret->nparens = r->nparens; - ret->lastparen = r->lastparen; - ret->lastcloseparen = r->lastcloseparen; - ret->intflags = r->intflags; - ret->extflags = r->extflags; + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } - ret->sublen = r->sublen; + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); - ret->engine = r->engine; - - ret->paren_names = hv_dup_inc(r->paren_names, param); + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); - if (RX_MATCH_COPIED(ret)) - ret->subbeg = SAVEPVN(r->subbeg, r->sublen); + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - - ret->pprivate = r->pprivate; - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); - - ptr_table_store(PL_ptr_table, r, ret); - return ret; + + if (ret->mother_re) { + if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { + /* Our storage points directly to our mother regexp, but that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + /* Note we need to sue SvCUR() on our mother_re, because it, in + turn, may well be pointing to its own mother_re. */ + SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), + SvCUR(ret->mother_re)+1)); + SvLEN_set(dstr, SvCUR(ret->mother_re)+1); + } + ret->mother_re = NULL; + } + ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -8747,30 +9680,25 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); regexp_internal *reti; int len, npar; RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; npar = r->nparens+1; - len = ri->offsets[0]; + len = ProgLen(ri); - Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); - if(ri->swap) { - Newx(reti->swap, 1, regexp_paren_ofs); - /* no need to copy these */ - Newx(reti->swap->startp, npar, I32); - Newx(reti->swap->endp, npar, I32); - } else { - reti->swap = NULL; - } - reti->regstclass = NULL; + if (ri->data) { struct reg_data *d; const int count = ri->data->count; @@ -8790,7 +9718,7 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) case 'S': case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ - d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param); + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; case 'f': /* This is cheating. */ @@ -8831,117 +9759,23 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) else reti->data = NULL; - Newx(reti->offsets, 2*len+1, U32); - Copy(ri->offsets, reti->offsets, 2*len+1, U32); - + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + return (void*)reti; } #endif /* USE_ITHREADS */ -/* - reg_stringify() - - converts a regexp embedded in a MAGIC struct to its stringified form, - caching the converted form in the struct and returns the cached - string. - - If lp is nonnull then it is used to return the length of the - resulting string - - If flags is nonnull and the returned string contains UTF8 then - (*flags & 1) will be true. - - If haseval is nonnull then it is used to return whether the pattern - contains evals. - - Normally called via macro: - - CALLREG_STRINGIFY(mg,&len,&utf8); - - And internally with - - CALLREG_AS_STR(mg,&lp,&flags,&haseval) - - See sv_2pv_flags() in sv.c for an example of internal usage. - - */ #ifndef PERL_IN_XSUB_RE -char * -Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { - dVAR; - const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = "msix"; - char reflags[6]; - char ch; - int left = 0; - int right = 4; - bool need_newline = 0; - U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12); - - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(left != 4) { - reflags[left] = '-'; - left = 5; - } - - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->extflags) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } - if (haseval) - *haseval = re->seen_evals; - if (flags) - *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; -} /* - regnext - dig the "next" pointer out of a node @@ -8973,6 +9807,8 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) SV *msv; const char *message; + PERL_ARGS_ASSERT_RE_CROAK2; + if (l1 > 510) l1 = 510; if (l1 + l2 > 510) @@ -9033,7 +9869,7 @@ Perl_save_re_context(pTHX) const REGEXP * const rx = PM_GETRE(PL_curpm); if (rx) { U32 i; - for (i = 1; i <= rx->nparens; i++) { + for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); GV *const *const gvp @@ -9054,7 +9890,7 @@ static void clear_re(pTHX_ void *r) { dVAR; - ReREFCNT_dec((regexp *)r); + ReREFCNT_dec((REGEXP *)r); } #ifdef DEBUGGING @@ -9062,12 +9898,26 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + PERL_ARGS_ASSERT_PUT_BYTE; + + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); - else if (c == '-' || c == ']' || c == '\\' || c == '^') - Perl_sv_catpvf(aTHX_ sv, "\\%c", c); - else - Perl_sv_catpvf(aTHX_ sv, "%c", c); + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } } @@ -9088,9 +9938,12 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, register U8 op = PSEUDO; /* Arbitrary non-END op. */ register const regnode *next; const regnode *optstart= NULL; + RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_DUMPUNTIL; + #ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); @@ -9101,13 +9954,12 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, while (PL_regkind[op] != END && (!last || node < last)) { /* While that wasn't END last time... */ - NODE_ALIGN(node); op = OP(node); if (op == CLOSE || op == WHILEM) indent--; next = regnext((regnode *)node); - + /* Where, what. */ if (OP(node) == OPTIMIZED) { if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) @@ -9116,23 +9968,21 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, goto after_print; } else CLEAR_OPTSTART; - + regprop(r, sv, node); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); - - if (OP(node) != OPTIMIZED) { - if (next == NULL) /* Next ptr. */ - PerlIO_printf(Perl_debug_log, "(0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) - PerlIO_printf(Perl_debug_log, "(FAIL)"); - else - PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); - - /*if (PL_regkind[(U8)op] != TRIE)*/ - (void)PerlIO_putc(Perl_debug_log, '\n'); - } - + + if (OP(node) != OPTIMIZED) { + if (next == NULL) /* Next ptr. */ + PerlIO_printf(Perl_debug_log, " (0)"); + else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, " (FAIL)"); + else + PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); + (void)PerlIO_putc(Perl_debug_log, '\n'); + } + after_print: if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); @@ -9152,18 +10002,18 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, else if ( PL_regkind[(U8)op] == TRIE ) { const regnode *this_trie = node; const char op = OP(node); - const I32 n = ARG(node); + const U32 n = ARG(node); const reg_ac_data * const ac = op>=AHOCORASICK ? (reg_ac_data *)ri->data->data[n] : NULL; const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET]; + AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; - sv_setpvn(sv, "", 0); + sv_setpvs(sv, ""); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); @@ -9172,7 +10022,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"