X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/37820adc4aeb8cb209843cf0abf3b24c8e5b59e8..0399b2152e23eb6ce1f09562d53b87be7fe30924:/regcomp.c diff --git a/regcomp.c b/regcomp.c index be5acdb..cbba23d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -85,6 +85,8 @@ # include "regcomp.h" #endif +#include "dquote_static.c" + #ifdef op #undef op #endif /* op */ @@ -196,7 +198,10 @@ typedef struct RExC_state_t { */ #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ + +/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single + * character, and if utf8, must be invariant. */ +#define SIMPLE 0x02 #define SPSTART 0x04 /* Starts with * or +. */ #define TRYAGAIN 0x08 /* Weeded out a declaration. */ #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ @@ -218,6 +223,11 @@ typedef struct RExC_state_t { #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) +/* If not already in utf8, do a longjmp back to the beginning */ +#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */ +#define REQUIRE_UTF8 STMT_START { \ + if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \ + } STMT_END /* About scan_data_t. @@ -358,9 +368,10 @@ static const scan_data_t zero_scan_data = #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ #define SCF_SEEN_ACCEPT 0x8000 -#define UTF (RExC_utf8 != 0) -#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0) -#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0) +#define UTF cBOOL(RExC_utf8) +#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE) +#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE) +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) #define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 @@ -870,7 +881,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con Dumps the final compressed table form of the trie to Perl_debug_log. Used for debugging make_trie(). */ - + STATIC void S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, AV *revcharmap, U32 depth) @@ -3116,7 +3127,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } flags &= ~SCF_DO_STCLASS; } - else if (strchr((const char*)PL_varies,OP(scan))) { + else if (REGNODE_VARIES(OP(scan))) { I32 mincount, maxcount, minnext, deltanext, fl = 0; I32 f = flags, pos_before = 0; regnode * const oscan = scan; @@ -3196,7 +3207,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* These are the cases when once a subexpression fails at a particular position, it cannot succeed even after backtracking at the enclosing scope. - + XXXX what if minimal match and we are at the initial run of {n,m}? */ if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) @@ -3268,7 +3279,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Skip open. */ nxt = regnext(nxt); - if (!strchr((const char*)PL_simple,OP(nxt)) + if (!REGNODE_SIMPLE(OP(nxt)) && !(PL_regkind[OP(nxt)] == EXACT && STR_LEN(nxt) == 1)) goto nogo; @@ -3289,11 +3300,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING OP(nxt1 + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ - NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ OP(nxt + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ #endif } nogo: @@ -3316,12 +3327,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, nxt = nxt2; OP(nxt2) = SUCCEED; /* Whas WHILEM */ /* Need to optimize away parenths. */ - if (data->flags & SF_IN_PAR) { + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { /* Set the parenth number. */ regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ - if (OP(nxt) != CLOSE) - FAIL("Panic opt close"); oscan->flags = (U8)ARG(nxt); if (RExC_open_parens) { RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ @@ -3339,7 +3348,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #if 0 while ( nxt1 && (OP(nxt1) != WHILEM)) { regnode *nnxt = regnext(nxt1); - if (nnxt == nxt) { if (reg_off_by_arg[OP(nxt1)]) ARG_SET(nxt1, nxt2 - nxt1); @@ -3406,7 +3414,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; - l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); @@ -3494,13 +3501,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { for (value = 0; value < 256; value++) if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - else { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { for (value = 0; value < 256; value++) if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + ANYOF_BITMAP_SET(data->start_class, value); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; @@ -3513,7 +3520,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->pos_delta += 1; data->longest = &(data->longest_float); } - } else if (OP(scan) == FOLDCHAR) { int d = ARG(scan)==0xDF ? 1 : 2; @@ -3527,7 +3533,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (strchr((const char*)PL_simple,OP(scan))) { + else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; if (flags & SCF_DO_SUBSTR) { @@ -3571,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); - for (value = 0; value < 256; value++) - if (!isALNUM(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (!isWORDCHAR_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (!isALNUM(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); - else { - for (value = 0; value < 256; value++) - if (isALNUM(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + else if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isWORDCHAR_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isALNUM(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } } break; case ALNUML: @@ -3600,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); - for (value = 0; value < 256; value++) - if (isALNUM(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isWORDCHAR_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isALNUM(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { @@ -3611,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { for (value = 0; value < 256; value++) if (!isALNUM(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -3629,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); - for (value = 0; value < 256; value++) - if (!isSPACE(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (!isSPACE_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (!isSPACE(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { - if (data->start_class->flags & ANYOF_LOCALE) + if (data->start_class->flags & ANYOF_LOCALE) { ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(data->start_class, value); + } + else if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isSPACE_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isSPACE(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } } } break; @@ -3658,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isSPACE_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isSPACE(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); - else { - for (value = 0; value < 256; value++) - if (!isSPACE(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + else if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (!isSPACE_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + else { + for (value = 0; value < 256; value++) { + if (!isSPACE(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } } break; case NSPACEL: @@ -3700,7 +3772,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { for (value = 0; value < 256; value++) if (isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -3717,7 +3789,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { for (value = 0; value < 256; value++) if (!isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -4264,18 +4336,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) struct regexp *r; register regexp_internal *ri; STRLEN plen; - char *exp = SvPV(pattern, plen); - char* xend = exp + plen; + char *exp; + char* xend; regnode *scan; I32 flags; I32 minlen = 0; I32 sawplus = 0; I32 sawopen = 0; + U8 jump_ret = 0; + dJMPENV; scan_data_t data; RExC_state_t RExC_state; RExC_state_t * const pRExC_state = &RExC_state; #ifdef TRIE_STUDY_OPT - int restudied= 0; + int restudied; RExC_state_t copyRExC_state; #endif GET_RE_DEBUG_FLAGS_DECL; @@ -4286,15 +4360,56 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); - DEBUG_COMPILE_r({ - SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, exp, plen, 60); - PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", - PL_colors[4],PL_colors[5],s); - }); -redo_first_pass: + /* Longjmp back to here if have to switch in midstream to utf8 */ + if (! RExC_orig_utf8) { + JMPENV_PUSH(jump_ret); + } + + if (jump_ret == 0) { /* First time through */ + exp = SvPV(pattern, plen); + xend = exp + plen; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, + dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + } + else { /* longjumped back */ + STRLEN len = plen; + + /* If the cause for the longjmp was other than changing to utf8, pop + * our own setjmp, and longjmp to the correct handler */ + if (jump_ret != UTF8_LONGJMP) { + JMPENV_POP; + JMPENV_JUMP(jump_ret); + } + + GET_RE_DEBUG_FLAGS; + + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8 = 1; + SAVEFREEPV(exp); + } + +#ifdef TRIE_STUDY_OPT + restudied = 0; +#endif + RExC_precomp = exp; RExC_flags = pm_flags; RExC_sawback = 0; @@ -4333,24 +4448,14 @@ redo_first_pass: RExC_precomp = NULL; return(NULL); } - if (RExC_utf8 && !RExC_orig_utf8) { - /* It's possible to write a regexp in ascii that represents Unicode - codepoints outside of the byte range, such as via \x{100}. If we - detect such a sequence we have to convert the entire pattern to utf8 - and then recompile, as our sizing calculation will have been based - on 1 byte == 1 character, but we will need to use utf8 to encode - at least some part of the pattern, and therefore must convert the whole - thing. - XXX: somehow figure out how to make this less expensive... - -- dmq */ - STRLEN len = plen; - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, - "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); - xend = exp + len; - RExC_orig_utf8 = RExC_utf8; - SAVEFREEPV(exp); - goto redo_first_pass; + + /* Here, finished first pass. Get rid of our setjmp, which we added for + * efficiency only if the passed-in string wasn't in utf8, as shown by + * RExC_orig_utf8. But if the first pass was redone, that variable will be + * 1 here even though the original string wasn't utf8, but in this case + * there will have been a long jump */ + if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) { + JMPENV_POP; } DEBUG_PARSE_r({ PerlIO_printf(Perl_debug_log, @@ -4392,39 +4497,56 @@ redo_first_pass: r->extflags = pm_flags; { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE)); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! has_charset); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - const STRLEN wraplen = plen + has_minus + has_p + has_runon + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + has_charset /* If needs a character set specifier */ + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - p = sv_grow(MUTABLE_SV(rx), wraplen + 1); - SvCUR_set(rx, wraplen); + p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ SvPOK_on(rx); SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + if (r->extflags & RXf_PMf_LOCALE) { + *p++ = LOCALE_PAT_MOD; + } else { + *p++ = UNICODE_PAT_MOD; + } + } if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { - char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; - char *colon = r + 1; char ch; - while((ch = *fptr++)) { if(reganch & 1) *p++ = ch; - else - *r-- = ch; reganch >>= 1; } - if(has_minus) { - *r = '-'; - p = colon; - } } *p++ = ':'; @@ -4436,6 +4558,7 @@ redo_first_pass: *p++ = '\n'; *p++ = ')'; *p = 0; + SvCUR_set(rx, p - SvPVX_const(rx)); } r->intflags = 0; @@ -4604,7 +4727,7 @@ reStudy: ri->regstclass = trie_op; } #endif - else if (strchr((const char*)PL_simple,OP(first))) + else if (REGNODE_SIMPLE(OP(first))) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOUND || PL_regkind[OP(first)] == NBOUND) @@ -4945,7 +5068,7 @@ reStudy: #endif #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "p" ); + ri->name_list_idx = add_data( pRExC_state, 1, "a" ); ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif @@ -4995,7 +5118,7 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, if (flags & RXapif_FETCH) { return reg_named_buff_fetch(rx, key, flags); } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { - Perl_croak(aTHX_ "%s", PL_no_modify); + Perl_croak_no_modify(aTHX); return NULL; } else if (flags & RXapif_EXISTS) { return reg_named_buff_exists(rx, key, flags) @@ -5295,7 +5418,7 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, PERL_UNUSED_ARG(value); if (!PL_localizing) - Perl_croak(aTHX_ "%s", PL_no_modify); + Perl_croak_no_modify(aTHX); } I32 @@ -5636,6 +5759,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; + bool has_use_defaults = FALSE; RExC_parse++; paren = *RExC_parse++; @@ -5765,7 +5889,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; case '=': /* (?=...) */ RExC_seen_zerolen++; - break; + break; case '!': /* (?!...) */ RExC_seen_zerolen++; if (*RExC_parse == ')') { @@ -6090,12 +6214,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); break; + case DEFAULT_PAT_MOD: /* Use default flags with the exceptions + that follow */ + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE); + goto parse_flags; default: --RExC_parse; parse_flags: /* (?i) */ { U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; + bool has_charset_modifier = 0; while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -6103,6 +6234,32 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case LOCALE_PAT_MOD: + if (has_charset_modifier || flagsp == &negflags) { + goto fail_modifiers; + } + posflags |= RXf_PMf_LOCALE; + negflags |= RXf_PMf_UNICODE; + has_charset_modifier = 1; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier || flagsp == &negflags) { + goto fail_modifiers; + } + posflags |= RXf_PMf_UNICODE; + negflags |= RXf_PMf_LOCALE; + has_charset_modifier = 1; + break; + case DUAL_PAT_MOD: + if (has_use_defaults + || has_charset_modifier + || flagsp == &negflags) + { + goto fail_modifiers; + } + negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); + has_charset_modifier = 1; + break; case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { @@ -6143,7 +6300,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; case '-': - if (flagsp == &negflags) { + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + fail_modifiers: RExC_parse++; vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); /*NOTREACHED*/ @@ -6746,7 +6907,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) return (regnode *) &RExC_parse; /* Invalid regnode pointer */ } - RExC_utf8 = 1; /* named sequences imply Unicode semantics */ + REQUIRE_UTF8; /* named sequences imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ if (valuep) { /* In a bracketed char class */ @@ -6791,7 +6952,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) } else { /* Not a char class */ char *s; /* String to put in generated EXACT node */ - STRLEN len = 0; /* Its current length */ + STRLEN len = 0; /* Its current byte length */ char *endchar; /* Points to '.' or '}' ending cur char in the input stream */ @@ -6801,7 +6962,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) /* Exact nodes can hold only a U8 length's of text = 255. Loop through * the input which is of the form now 'c1.c2.c3...}' until find the - * ending brace or exeed length 255. The characters that exceed this + * ending brace or exceed length 255. The characters that exceed this * limit are dropped. The limit could be relaxed should it become * desirable by reparsing this as (?:\N{NAME}), so could generate * multiple EXACT nodes, as is done for just regular input. But this @@ -7134,31 +7295,61 @@ tryagain: *flagp |= HASWIDTH; goto finish_meta_pat; case 'w': - ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(ALNUML)); + } else { + ret = reg_node(pRExC_state, (U8)(ALNUM)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'W': - ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NALNUML)); + } else { + ret = reg_node(pRExC_state, (U8)(NALNUM)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(BOUNDL)); + } else { + ret = reg_node(pRExC_state, (U8)(BOUND)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= SIMPLE; goto finish_meta_pat; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NBOUNDL)); + } else { + ret = reg_node(pRExC_state, (U8)(NBOUND)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= SIMPLE; goto finish_meta_pat; case 's': - ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(SPACEL)); + } else { + ret = reg_node(pRExC_state, (U8)(SPACE)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'S': - ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NSPACEL)); + } else { + ret = reg_node(pRExC_state, (U8)(NSPACE)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'd': @@ -7462,6 +7653,35 @@ tryagain: ender = ASCII_TO_NATIVE('\007'); p++; break; + case 'o': + { + STRLEN brace_len = len; + UV result; + const char* error_msg; + + bool valid = grok_bslash_o(p, + &result, + &brace_len, + &error_msg, + 1); + p += brace_len; + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + else + { + ender = result; + } + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } case 'x': if (*++p == '{') { char* const e = strchr(p, '}'); @@ -7476,7 +7696,7 @@ tryagain: STRLEN numlen = e - p - 1; ender = grok_hex(p + 1, &numlen, &flags, NULL); if (ender > 0xff) - RExC_utf8 = 1; + REQUIRE_UTF8; p = e + 1; } } @@ -7496,21 +7716,13 @@ tryagain: case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { - I32 flags = 0; + (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); - - /* An octal above 0xff is interpreted differently - * depending on if the re is in utf8 or not. If it - * is in utf8, the value will be itself, otherwise - * it is interpreted as modulo 0x100. It has been - * decided to discourage the use of octal above the - * single-byte range. For now, warn only when - * it ends up modulo */ - if (SIZE_ONLY && ender >= 0x100 - && ! UTF && ! PL_encoding) { - ckWARNregdep(p, "Use of octal value above 377 is deprecated"); + if (ender > 0xff) { + REQUIRE_UTF8; } p += numlen; } @@ -7527,7 +7739,7 @@ tryagain: ender = reg_recode((const char)(U8)ender, &enc); if (!enc && SIZE_ONLY) ckWARNreg(p, "Invalid escape in the specified encoding"); - RExC_utf8 = 1; + REQUIRE_UTF8; } break; case '\0': @@ -7844,43 +8056,55 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } - -#define _C_C_T_(NAME,TEST,WORD) \ +/* No locale test */ +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ ANYOF_##NAME: \ - if (LOC) \ - ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else { \ for (value = 0; value < 256; value++) \ if (TEST) \ ANYOF_BITMAP_SET(ret, value); \ - } \ yesno = '+'; \ what = WORD; \ break; \ case ANYOF_N##NAME: \ - if (LOC) \ - ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else { \ for (value = 0; value < 256; value++) \ if (!TEST) \ ANYOF_BITMAP_SET(ret, value); \ - } \ yesno = '!'; \ what = WORD; \ break -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +/* Like the above, but there are differences if we are in uni-8-bit or not, so + * there are two tests passed in, to use depending on that. There aren't any + * cases where the label is different from the name, so no need for that + * parameter */ +#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \ ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - ANYOF_BITMAP_SET(ret, value); \ + if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ + else if (UNI_SEMANTICS) { \ + for (value = 0; value < 256; value++) { \ + if (TEST_8) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ + else { \ + for (value = 0; value < 256; value++) { \ + if (TEST_7) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ yesno = '+'; \ what = WORD; \ break; \ case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - ANYOF_BITMAP_SET(ret, value); \ + if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ + else if (UNI_SEMANTICS) { \ + for (value = 0; value < 256; value++) { \ + if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ + else { \ + for (value = 0; value < 256; value++) { \ + if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ yesno = '!'; \ what = WORD; \ break @@ -7928,7 +8152,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) #ifdef EBCDIC UV literal_endpoint = 0; #endif - UV stored = 0; /* number of chars stored in the class */ + UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */ regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ @@ -8086,6 +8310,24 @@ parseit: case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + { + const char* error_msg; + bool valid = grok_bslash_o(RExC_parse, + &value, + &numlen, + &error_msg, + SIZE_ONLY); + RExC_parse += numlen; + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; case 'x': if (*RExC_parse == '{') { I32 flags = PERL_SCAN_ALLOW_UNDERSCORES @@ -8111,9 +8353,10 @@ parseit: value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': + case '5': case '6': case '7': { - I32 flags = 0; + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; numlen = 3; value = grok_oct(--RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; @@ -8131,10 +8374,12 @@ parseit: break; } default: - if (!SIZE_ONLY && isALPHA(value)) + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isALNUM(value) && value != '_') { ckWARN2reg(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + } break; } } /* end of \blah */ @@ -8145,10 +8390,23 @@ parseit: if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ - if (!SIZE_ONLY && !need_class) - ANYOF_CLASS_ZERO(ret); - - need_class = 1; + /* What matches in a locale is not known until runtime, so need to + * (one time per class) allocate extra space to pass to regexec. + * The space will contain a bit for each named class that is to be + * matched against. This isn't needed for \p{} and pseudo-classes, + * as they are not affected by locale, and hence are dealt with + * separately */ + if (LOC && namedclass < ANYOF_MAX && ! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_CLASS_ADD_SKIP; + } + else { + RExC_emit += ANYOF_CLASS_ADD_SKIP; + ANYOF_CLASS_ZERO(ret); + } + ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE; + } /* a bad range like a-\d, a-[:digit:] ? */ if (range) { @@ -8188,24 +8446,26 @@ parseit: * --jhi */ switch ((I32)namedclass) { - case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); - case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); - case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); - case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); - case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); - case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); - case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); - case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); - case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); - case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); + case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum"); + case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha"); + case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank"); + case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl"); + case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph"); + case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower"); + case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint"); + case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace"); + case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct"); + case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper"); #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS - case _C_C_T_(ALNUM, isALNUM(value), "Word"); - case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); + /* \s, \w match all unicode if utf8. */ + case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl"); + case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word"); #else - case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); - case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); + /* \s, \w match ascii and locale only */ + case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord"); #endif - case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: @@ -8277,8 +8537,7 @@ parseit: /* Strings such as "+utf8::isWord\n" */ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); } - if (LOC) - ANYOF_FLAGS(ret) |= ANYOF_CLASS; + stored+=2; /* can't optimize this class */ continue; } } /* end of namedclass \blah */ @@ -8437,13 +8696,6 @@ parseit: range = 0; /* this range (if it was one) is done now */ } - if (need_class) { - ANYOF_FLAGS(ret) |= ANYOF_LARGE; - if (SIZE_ONLY) - RExC_size += ANYOF_CLASS_ADD_SKIP; - else - RExC_emit += ANYOF_CLASS_ADD_SKIP; - } if (SIZE_ONLY) @@ -8547,7 +8799,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) /* nextchar() - Advance that parse position, and optionall absorbs + Advance that parse position, and optionally absorbs "whitespace" from the inputstream. Without /x "whitespace" means (?#...) style comments only, @@ -8929,31 +9181,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, #endif /* - - regcurly - a little FSA that accepts {\d+,?\d*} - */ -#ifndef PERL_IN_XSUB_RE -I32 -Perl_regcurly(register const char *s) -{ - PERL_ARGS_ASSERT_REGCURLY; - - if (*s++ != '{') - return FALSE; - if (!isDIGIT(*s)) - return FALSE; - while (isDIGIT(*s)) - s++; - if (*s == ',') - s++; - while (isDIGIT(*s)) - s++; - if (*s != '}') - return FALSE; - return TRUE; -} -#endif - -/* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING @@ -9556,6 +9783,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ switch (ri->data->what[n]) { + case 'a': case 's': case 'S': case 'u': @@ -9636,9 +9864,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) Safefree(ri); } -#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) -#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t))) -#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t))) +#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) +#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* @@ -9790,8 +10017,9 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = ri->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sSfpontTu + /* legal options are one of: sSfpontTua see also regcomp.h and pregfree() */ + case 'a': /* actually an AV, but the dup function is identical. */ case 's': case 'S': case 'p': /* actually an AV, but the dup function is identical. */ @@ -9867,6 +10095,10 @@ Perl_regnext(pTHX_ register regnode *p) if (!p) return(NULL); + if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + } + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); if (offset == 0) return(NULL);