X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/cfba2ecca3ba2c255c4533aa7cc149abdeea3ec0..556a074941f5c3a4ba6613f10be6c56138989a11:/regcomp.c diff --git a/regcomp.c b/regcomp.c index d515d1c..6ce4996 100644 --- a/regcomp.c +++ b/regcomp.c @@ -850,7 +850,8 @@ static const scan_data_t zero_scan_data = { #define UPDATE_WARNINGS_LOC(loc) \ STMT_START { \ if (TO_OUTPUT_WARNINGS(loc)) { \ - RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \ + RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ + - RExC_precomp; \ } \ } STMT_END @@ -2701,7 +2702,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, #endif switch (flags) { - case EXACT: case EXACT_ONLY8: case EXACTL: break; + case EXACT: case EXACT_REQ8: case EXACTL: break; case EXACTFAA: case EXACTFUP: case EXACTFU: @@ -2716,7 +2717,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL) + if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -2792,8 +2793,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( noper < tail && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_ONLY8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 || OP(noper) == EXACTFUP)))) { uc= (U8*)STRING(noper); @@ -3010,8 +3011,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( noper < tail && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_ONLY8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 || OP(noper) == EXACTFUP)))) { const U8 *uc= (U8*)STRING(noper); @@ -3235,8 +3236,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( noper < tail && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_ONLY8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 || OP(noper) == EXACTFUP)))) { const U8 *uc= (U8*)STRING(noper); @@ -3551,9 +3552,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( state==1 ) { OP( convert ) = nodetype; str=STRING(convert); - STR_LEN(convert)=0; + setSTR_LEN(convert, 0); } - STR_LEN(convert) += len; + setSTR_LEN(convert, STR_LEN(convert) + len); while (len--) *str++ = *ch++; } else { @@ -3993,8 +3994,9 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * using /iaa matching will be doing so almost entirely with ASCII * strings, so this should rarely be encountered in practice */ -#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ - if (PL_regkind[OP(scan)] == EXACT) \ +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ + if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \ + && OP(scan) != LEXACT_REQ8) \ join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1) STATIC U32 @@ -4059,16 +4061,16 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, /* Joining something that requires UTF-8 with something that * doesn't, means the result requires UTF-8. */ - if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) { - OP(scan) = EXACT_ONLY8; + if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { + OP(scan) = EXACT_REQ8; } - else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) { + else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { ; /* join is compatible, no need to change OP */ } - else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) { - OP(scan) = EXACTFU_ONLY8; + else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { + OP(scan) = EXACTFU_REQ8; } - else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) { + else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { ; /* join is compatible, no need to change OP */ } else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { @@ -4091,7 +4093,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * node. And if the only adjacent node is EXACTF, they get * absorbed into that, under the theory that a longer node is * better than two shorter ones, even if one is EXACTFU. Note - * that EXACTFU_ONLY8 is generated only for UTF-8 patterns, + * that EXACTFU_REQ8 is generated only for UTF-8 patterns, * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ if (STRING(n)[STR_LEN(n)-1] == 's') { @@ -4160,7 +4162,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, merged++; NEXT_OFF(scan) += NEXT_OFF(n); - STR_LEN(scan) += STR_LEN(n); + setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n)); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); @@ -4199,7 +4201,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this final joining, sequences could have been split over boundaries, and * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ - if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) { + if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { U8* s0 = (U8*) STRING(scan); U8* s = s0; U8* s_end = s0 + STR_LEN(scan); @@ -4542,31 +4544,31 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, */ JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); - /* Follow the next-chain of the current node and optimize - away all the NOTHINGs from it. */ - if (OP(scan) != CURLYX) { - const int max = (reg_off_by_arg[OP(scan)] - ? I32_MAX - /* I32 may be smaller than U16 on CRAYs! */ - : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); - int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); - int noff; - regnode *n = scan; - - /* Skip NOTHING and LONGJMP. */ - while ((n = regnext(n)) - && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) - || ((OP(n) == LONGJMP) && (noff = ARG(n)))) - && off + noff < max) - off += noff; - if (reg_off_by_arg[OP(scan)]) - ARG(scan) = off; - else - NEXT_OFF(scan) = off; - } + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ( (n = regnext(n)) + && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } - /* The principal pseudo-switch. Cannot be a switch, since we - look into several different things. */ + /* The principal pseudo-switch. Cannot be a switch, since we look into + * several different things. */ if ( OP(scan) == DEFINEP ) { SSize_t minlen = 0; SSize_t deltanext = 0; @@ -4858,9 +4860,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ----------------+----------- NOTHING | NOTHING EXACT | EXACT - EXACT_ONLY8 | EXACT + EXACT_REQ8 | EXACT EXACTFU | EXACTFU - EXACTFU_ONLY8 | EXACTFU + EXACTFU_REQ8 | EXACTFU EXACTFUP | EXACTFU EXACTFAA | EXACTFAA EXACTL | EXACTL @@ -4870,10 +4872,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ ? NOTHING \ - : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \ + : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ ? EXACT \ : ( EXACTFU == (X) \ - || EXACTFU_ONLY8 == (X) \ + || EXACTFU_REQ8 == (X) \ || EXACTFUP == (X) ) \ ? EXACTFU \ : ( EXACTFAA == (X) ) \ @@ -5197,7 +5199,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else if ( OP(scan) == EXACT - || OP(scan) == EXACT_ONLY8 + || OP(scan) == LEXACT + || OP(scan) == EXACT_REQ8 + || OP(scan) == LEXACT_REQ8 || OP(scan) == EXACTL) { SSize_t l = STR_LEN(scan); @@ -5282,7 +5286,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (flags & SCF_DO_STCLASS) { - SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan); + SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); assert(EXACTF_invlist); if (flags & SCF_DO_STCLASS_AND) { @@ -5319,7 +5323,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); if ( OP(next) == EXACT - || OP(next) == EXACT_ONLY8 + || OP(next) == LEXACT + || OP(next) == EXACT_REQ8 + || OP(next) == LEXACT_REQ8 || OP(next) == EXACTL || (flags & SCF_DO_STCLASS)) { @@ -6171,7 +6177,6 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", } #endif } - else if (OP(scan) == OPEN) { if (stopparen != (I32)ARG(scan)) pars++; @@ -7584,6 +7589,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && memEQ(RX_PRECOMP(old_re), exp, plen) && !runtime_code /* with runtime code, always recompile */ ) { + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len); + Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n", + PL_colors[4], PL_colors[5], s); + }); return old_re; } @@ -7972,7 +7983,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Ignore EXACT as we deal with it later. */ if (PL_regkind[OP(first)] == EXACT) { if ( OP(first) == EXACT - || OP(first) == EXACT_ONLY8 + || OP(first) == LEXACT + || OP(first) == EXACT_REQ8 + || OP(first) == LEXACT_REQ8 || OP(first) == EXACTL) { NOOP; /* Empty, get anchored substr later. */ @@ -8318,7 +8331,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && nop == END) RExC_rx->extflags |= RXf_WHITE; else if ( RExC_rx->extflags & RXf_SPLIT - && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL) + && ( fop == EXACT || fop == LEXACT + || fop == EXACT_REQ8 || fop == LEXACT_REQ8 + || fop == EXACTL) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && nop == END ) @@ -9016,23 +9031,6 @@ S__invlist_array_init(SV* const invlist, const bool will_have_0) return zero_addr + *offset; } -PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) -{ - /* Sets the current number of elements stored in the inversion list. - * Updates SvCUR correspondingly */ - PERL_UNUSED_CONTEXT; - PERL_ARGS_ASSERT_INVLIST_SET_LEN; - - assert(is_invlist(invlist)); - - SvCUR_set(invlist, - (len == 0) - ? 0 - : TO_INTERNAL_SIZE(len + offset)); - assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); -} - STATIC void S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) { @@ -9183,6 +9181,7 @@ S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) invlist_iterfinish(invlist); *get_invlist_previous_index_addr(invlist) = 0; + SvPOK_on(invlist); /* This allows B to extract the PV */ } SV* @@ -9257,25 +9256,12 @@ Perl__new_invlist_C_array(pTHX_ const UV* const list) invlist_iterfinish(invlist); SvREADONLY_on(invlist); + SvPOK_on(invlist); return invlist; } STATIC void -S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) -{ - /* Grow the maximum size of an inversion list */ - - PERL_ARGS_ASSERT_INVLIST_EXTEND; - - assert(is_invlist(invlist)); - - /* Add one to account for the zero element at the beginning which may not - * be counted by the calling parameters */ - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); -} - -STATIC void S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) { @@ -10261,11 +10247,6 @@ Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, #endif -PERL_STATIC_INLINE SV* -S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { - return _add_range_to_invlist(invlist, cp, cp); -} - #ifndef PERL_IN_XSUB_RE void Perl__invlist_invert(pTHX_ SV* const invlist) @@ -10316,108 +10297,6 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) #endif -PERL_STATIC_INLINE STRLEN* -S_get_invlist_iter_addr(SV* invlist) -{ - /* Return the address of the UV that contains the current iteration - * position */ - - PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - - assert(is_invlist(invlist)); - - return &(((XINVLIST*) SvANY(invlist))->iterator); -} - -PERL_STATIC_INLINE void -S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ -{ - PERL_ARGS_ASSERT_INVLIST_ITERINIT; - - *get_invlist_iter_addr(invlist) = 0; -} - -PERL_STATIC_INLINE void -S_invlist_iterfinish(SV* invlist) -{ - /* Terminate iterator for invlist. This is to catch development errors. - * Any iteration that is interrupted before completed should call this - * function. Functions that add code points anywhere else but to the end - * of an inversion list assert that they are not in the middle of an - * iteration. If they were, the addition would make the iteration - * problematical: if the iteration hadn't reached the place where things - * were being added, it would be ok */ - - PERL_ARGS_ASSERT_INVLIST_ITERFINISH; - - *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; -} - -STATIC bool -S_invlist_iternext(SV* invlist, UV* start, UV* end) -{ - /* An C call on must be used to set this up. - * This call sets in <*start> and <*end>, the next range in . - * Returns if successful and the next call will return the next - * range; if was already at the end of the list. If the latter, - * <*start> and <*end> are unchanged, and the next call to this function - * will start over at the beginning of the list */ - - STRLEN* pos = get_invlist_iter_addr(invlist); - UV len = _invlist_len(invlist); - UV *array; - - PERL_ARGS_ASSERT_INVLIST_ITERNEXT; - - if (*pos >= len) { - *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ - return FALSE; - } - - array = invlist_array(invlist); - - *start = array[(*pos)++]; - - if (*pos >= len) { - *end = UV_MAX; - } - else { - *end = array[(*pos)++] - 1; - } - - return TRUE; -} - -PERL_STATIC_INLINE UV -S_invlist_highest(SV* const invlist) -{ - /* Returns the highest code point that matches an inversion list. This API - * has an ambiguity, as it returns 0 under either the highest is actually - * 0, or if the list is empty. If this distinction matters to you, check - * for emptiness before calling this function */ - - UV len = _invlist_len(invlist); - UV *array; - - PERL_ARGS_ASSERT_INVLIST_HIGHEST; - - if (len == 0) { - return 0; - } - - array = invlist_array(invlist); - - /* The last element in the array in the inversion list always starts a - * range that goes to infinity. That range may be for code points that are - * matched in the inversion list, or it may be for ones that aren't - * matched. In the latter case, the highest code point in the set is one - * less than the beginning of this range; otherwise it is the final element - * of this range: infinity */ - return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) - ? UV_MAX - : array[len - 1] - 1; -} - STATIC SV * S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) { @@ -10596,7 +10475,7 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) * call SvREFCNT_dec() when done with it. */ STATIC SV* -S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) +S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) { dVAR; const U8 * s = (U8*)STRING(node); @@ -10605,7 +10484,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) /* Start out big enough for 2 separate code points */ SV* invlist = _new_invlist(4); - PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST; + PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; if (! UTF) { uc = *s; @@ -11079,7 +10958,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REG; DEBUG_PARSE("reg "); - max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD); assert(max_open); if (!SvIOK(max_open)) { @@ -11312,12 +11190,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) goto parse_rest; } - /* By doing this here, we avoid extra warnings for nested - * script runs */ - ckWARNexperimental(RExC_parse, - WARN_EXPERIMENTAL__SCRIPT_RUN, - "The script_run feature is experimental"); - if (paren == 's') { /* Here, we're starting a new regular script run */ ret = reg_node(pRExC_state, SROPEN); @@ -11362,9 +11234,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /*FALLTHROUGH*/ alpha_assertions: - ckWARNexperimental(RExC_parse, - WARN_EXPERIMENTAL__ALPHA_ASSERTIONS, - "The alpha_assertions feature is experimental"); RExC_seen_zerolen++; @@ -12067,16 +11936,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) goto parse_rest; } /* end switch */ } - else { - if (*RExC_parse == '{') { - ckWARNregdep(RExC_parse + 1, - "Unescaped left brace in regex is " - "deprecated here (and will be fatal " - "in Perl 5.32), passed through"); - } - /* Not bothering to indent here, as the above 'else' is temporary - * */ - if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ + else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ capturing_parens: parno = RExC_npar; RExC_npar++; @@ -12143,7 +12003,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ paren = ':'; ret = 0; - } } } else /* ! paren */ @@ -13916,13 +13775,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) UV ender = 0; char *p; char *s; - -/* This allows us to fill a node with just enough spare so that if the final - * character folds, its expansion is guaranteed to fit */ -#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE) - char *s0; - U8 upper_parse = MAX_NODE_STRING_SIZE; + U32 max_string_len = 255; + + /* We may have to reparse the node, artificially stopping filling + * it early, based on info gleaned in the first parse. This + * variable gives where we stop. Make it above the normal stopping + * place first time through; otherwise it would stop too early */ + U32 upper_fill = max_string_len + 1; /* We start out as an EXACT node, even if under /i, until we find a * character which is in a fold. The algorithm now segregates into @@ -13936,12 +13796,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) U8 node_type = EXACT; /* Assume the node will be fully used; the excess is given back at - * the end. We can't make any other length assumptions, as a byte - * input sequence could shrink down. */ - Ptrdiff_t initial_size = STR_SZ(256); + * the end. Under /i, leave enough extra room so that we won't + * overflow the buffer when we fold a character which would end up + * overflowing the node. We can't make any other length + * assumptions, as a byte input sequence could shrink down. */ + Ptrdiff_t current_string_nodes = STR_SZ(max_string_len + + ((! FOLD) + ? 0 + : 1 * ((UTF) + ? UTF8_MAXBYTES_CASE + /* Max non-UTF-8 expansion is 2 */ : 2))); bool next_is_quantifier; char * oldp = NULL; + char * old_oldp = NULL; /* We can convert EXACTF nodes to EXACTFU if they contain only * characters that match identically regardless of the target @@ -13969,10 +13837,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* So is the MICRO SIGN */ bool has_micro_sign = FALSE; + /* Set when we fill up the current node and there is still more + * text to process */ + bool overflowed; + /* Allocate an EXACT node. The node_type may change below to * another EXACTish node, but since the size of the node doesn't * change, it works */ - ret = regnode_guts(pRExC_state, node_type, initial_size, "exact"); + ret = regnode_guts(pRExC_state, node_type, current_string_nodes, + "exact"); FILL_NODE(ret, node_type); RExC_emit++; @@ -13982,6 +13855,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reparse: + p = RExC_parse; + len = 0; + s = s0; + node_type = EXACT; + oldp = NULL; + maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); + maybe_SIMPLE = SIMPLE; + requires_utf8_target = FALSE; + has_ss = FALSE; + has_micro_sign = FALSE; + + continue_parse: + /* This breaks under rare circumstances. If folding, we do not * want to split a node at a character that is a non-final in a * multi-char fold, as an input string could just happen to want to @@ -13996,17 +13882,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) || UTF8_IS_START(UCHARAT(RExC_parse))); + overflowed = FALSE; + /* Here, we have a literal character. Find the maximal string of * them in the input that we can fit into a single EXACTish node. * We quit at the first non-literal or when the node gets full, or * under /i the categorization of folding/non-folding character * changes */ - for (p = RExC_parse; len < upper_parse && p < RExC_end; ) { + while (p < RExC_end && len < upper_fill) { /* In most cases each iteration adds one byte to the output. * The exceptions override this */ Size_t added_len = 1; + old_oldp = oldp; oldp = p; /* White space has already been ignored */ @@ -14339,20 +14228,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Ready to add 'ender' to the node */ if (! FOLD) { /* The simple case, just append the literal */ + not_fold_common: - not_fold_common: - if (UVCHR_IS_INVARIANT(ender) || ! UTF) { - *(s++) = (char) ender; - } - else { - U8 * new_s = uvchr_to_utf8((U8*)s, ender); - added_len = (char *) new_s - s; - s = (char *) new_s; + /* Don't output if it would overflow */ + if (UNLIKELY(len > max_string_len - ((UTF) + ? UVCHR_SKIP(ender) + : 1))) + { + overflowed = TRUE; + break; + } - if (ender > 255) { - requires_utf8_target = TRUE; - } + if (UVCHR_IS_INVARIANT(ender) || ! UTF) { + *(s++) = (char) ender; + } + else { + U8 * new_s = uvchr_to_utf8((U8*)s, ender); + added_len = (char *) new_s - s; + s = (char *) new_s; + + if (ender > 255) { + requires_utf8_target = TRUE; } + } } else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { @@ -14416,22 +14314,35 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } - if (UTF) { /* Use the folded value */ + if (UTF) { /* Alway use the folded value for UTF-8 + patterns */ if (UVCHR_IS_INVARIANT(ender)) { + if (UNLIKELY(len + 1 > max_string_len)) { + overflowed = TRUE; + break; + } + *(s)++ = (U8) toFOLD(ender); } else { - ender = _to_uni_fold_flags( + UV folded = _to_uni_fold_flags( ender, - (U8 *) s, + (U8 *) s, /* We have allocated extra space + in 's' so can't run off the + end */ &added_len, FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (UNLIKELY(len + added_len > max_string_len)) { + overflowed = TRUE; + break; + } + s += added_len; - if ( ender > 255 - && LIKELY(ender != GREEK_SMALL_LETTER_MU)) + if ( folded > 255 + && LIKELY(folded != GREEK_SMALL_LETTER_MU)) { /* U+B5 folds to the MU, so its possible for a * non-UTF-8 target to match it */ @@ -14439,63 +14350,77 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } } - else { - - /* Here is non-UTF8. First, see if the character's - * fold differs between /d and /u. */ - if (PL_fold[ender] != PL_fold_latin1[ender]) { - maybe_exactfu = FALSE; + else { /* Here is non-UTF8. */ + + /* The fold will be one or (rarely) two characters. + * Check that there's room for at least a single one + * before setting any flags, etc. Because otherwise an + * overflowing character could cause a flag to be set + * even though it doesn't end up in this node. (For + * the two character fold, we check again, before + * setting any flags) */ + if (UNLIKELY(len + 1 > max_string_len)) { + overflowed = TRUE; + break; } #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ || UNICODE_DOT_DOT_VERSION > 0) - /* On non-ancient Unicode versions, this includes the - * multi-char fold SHARP S to 'ss' */ - - if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S) - || ( isALPHA_FOLD_EQ(ender, 's') - && len > 0 - && isALPHA_FOLD_EQ(*(s-1), 's'))) - { - /* Here, we have one of the following: - * a) a SHARP S. This folds to 'ss' only under - * /u rules. If we are in that situation, - * fold the SHARP S to 'ss'. See the comments - * for join_exact() as to why we fold this - * non-UTF at compile time, and no others. - * b) 'ss'. When under /u, there's nothing - * special needed to be done here. The - * previous iteration handled the first 's', - * and this iteration will handle the second. - * If, on the otherhand it's not /u, we have - * to exclude the possibility of moving to /u, - * so that we won't generate an unwanted - * match, unless, at runtime, the target - * string is in UTF-8. - * */ + /* On non-ancient Unicodes, check for the only possible + * multi-char fold */ + if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { + /* This potential multi-char fold means the node + * can't be simple (because it could match more + * than a single char). And in some cases it will + * match 'ss', so set that flag */ + maybe_SIMPLE = 0; has_ss = TRUE; - maybe_exactfu = FALSE; /* Can't generate an - EXACTFU node (unless we - already are in one) */ - if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { - maybe_SIMPLE = 0; - if (node_type == EXACTFU) { - *(s++) = 's'; - - /* Let the code below add in the extra 's' */ - ender = 's'; - added_len = 2; + + /* It can't change to be an EXACTFU (unless already + * is one). We fold it iff under /u rules. */ + if (node_type != EXACTFU) { + maybe_exactfu = FALSE; + } + else { + if (UNLIKELY(len + 2 > max_string_len)) { + overflowed = TRUE; + break; } + + *(s++) = 's'; + *(s++) = 's'; + added_len = 2; + + goto done_with_this_char; } } + else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's')) + && LIKELY(len > 0) + && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's'))) + { + /* Also, the sequence 'ss' is special when not + * under /u. If the target string is UTF-8, it + * should match SHARP S; otherwise it won't. So, + * here we have to exclude the possibility of this + * node moving to /u.*/ + has_ss = TRUE; + maybe_exactfu = FALSE; + } #endif + /* Here, the fold will be a single character */ - else if (UNLIKELY(ender == MICRO_SIGN)) { + if (UNLIKELY(ender == MICRO_SIGN)) { has_micro_sign = TRUE; } + else if (PL_fold[ender] != PL_fold_latin1[ender]) { + + /* If the character's fold differs between /d and + * /u, this can't change to be an EXACTFU node */ + maybe_exactfu = FALSE; + } *(s++) = (DEPENDS_SEMANTICS) ? (char) toFOLD(ender) @@ -14510,6 +14435,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } /* End of adding current character to the node */ + done_with_this_char: + len += added_len; if (next_is_quantifier) { @@ -14521,168 +14448,287 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } /* End of loop through literal characters */ - /* Here we have either exhausted the input or ran out of room in - * the node. (If we encountered a character that can't be in the - * node, transfer is made directly to , and so we - * wouldn't have fallen off the end of the loop.) In the latter - * case, we artificially have to split the node into two, because - * we just don't have enough space to hold everything. This - * creates a problem if the final character participates in a - * multi-character fold in the non-final position, as a match that - * should have occurred won't, due to the way nodes are matched, - * and our artificial boundary. So back off until we find a non- - * problematic character -- one that isn't at the beginning or - * middle of such a fold. (Either it doesn't participate in any - * folds, or appears only in the final position of all the folds it - * does participate in.) A better solution with far fewer false - * positives, and that would fill the nodes more completely, would - * be to actually have available all the multi-character folds to - * test against, and to back-off only far enough to be sure that - * this node isn't ending with a partial one. is set - * further below (if we need to reparse the node) to include just - * up through that final non-problematic character that this code - * identifies, so when it is set to less than the full node, we can - * skip the rest of this */ - if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { - PERL_UINT_FAST8_T backup_count = 0; - - const STRLEN full_len = len; - - assert(len >= MAX_NODE_STRING_SIZE); - - /* Here, points to just beyond where we have output the - * final character of the node. Look backwards through the - * string until find a non- problematic character */ - - if (! UTF) { - - /* This has no multi-char folds to non-UTF characters */ - if (ASCII_FOLD_RESTRICTED) { - goto loopdone; - } + /* Here we have either exhausted the input or run out of room in + * the node. If the former, we are done. (If we encountered a + * character that can't be in the node, transfer is made directly + * to , and so we wouldn't have fallen off the end of the + * loop.) */ + if (LIKELY(! overflowed)) { + goto loopdone; + } + + /* Here we have run out of room. We can grow plain EXACT and + * LEXACT nodes. If the pattern is gigantic enough, though, + * eventually we'll have to artificially chunk the pattern into + * multiple nodes. */ + if (! LOC && (node_type == EXACT || node_type == LEXACT)) { + Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))]; + Size_t overhead_expansion = 0; + char temp[256]; + Size_t max_nodes_for_string; + Size_t achievable; + SSize_t delta; + + /* Here we couldn't fit the final character in the current + * node, so it will have to be reparsed, no matter what else we + * do */ + p = oldp; + + /* If would have overflowed a regular EXACT node, switch + * instead to an LEXACT. The code below is structured so that + * the actual growing code is common to changing from an EXACT + * or just increasing the LEXACT size. This means that we have + * to save the string in the EXACT case before growing, and + * then copy it afterwards to its new location */ + if (node_type == EXACT) { + overhead_expansion = regarglen[LEXACT] - regarglen[EXACT]; + RExC_emit += overhead_expansion; + Copy(s0, temp, len, char); + } + + /* Ready to grow. If it was a plain EXACT, the string was + * saved, and the first few bytes of it overwritten by adding + * an argument field. We assume, as we do elsewhere in this + * file, that one byte of remaining input will translate into + * one byte of output, and if that's too small, we grow again, + * if too large the excess memory is freed at the end */ + + max_nodes_for_string = U16_MAX - overhead - overhead_expansion; + achievable = MIN(max_nodes_for_string, + current_string_nodes + STR_SZ(RExC_end - p)); + delta = achievable - current_string_nodes; + + /* If there is just no more room, go finish up this chunk of + * the pattern. */ + if (delta <= 0) { + goto loopdone; + } - while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { - backup_count++; - } - len = s - s0 + 1; - } + change_engine_size(pRExC_state, delta + overhead_expansion); + current_string_nodes += delta; + max_string_len + = sizeof(struct regnode) * current_string_nodes; + upper_fill = max_string_len + 1; + + /* If the length was small, we know this was originally an + * EXACT node now converted to LEXACT, and the string has to be + * restored. Otherwise the string was untouched. 260 is just + * a number safely above 255 so don't have to worry about + * getting it precise */ + if (len < 260) { + node_type = LEXACT; + FILL_NODE(ret, node_type); + s0 = STRING(REGNODE_p(ret)); + Copy(temp, s0, len, char); + s = s0 + len; + } + + goto continue_parse; + } + else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8 + locale, and prepare for that? */ + + /* Here is /i. Running out of room creates a problem if we are + * folding, and the split happens in the middle of a + * multi-character fold, as a match that should have occurred, + * won't, due to the way nodes are matched, and our artificial + * boundary. So back off until we aren't splitting such a + * fold. If there is no such place to back off to, we end up + * taking the entire node as-is. This can happen if the node + * consists entirely of 'f' or entirely of 's' characters (or + * things that fold to them) as 'ff' and 'ss' are + * multi-character folds. + * + * At this point: + * old_oldp points to the beginning in the input of the + * penultimate character in the node. + * oldp points to the beginning in the input of the + * final character in the node. + * p points to the beginning in the input of the + * next character in the input, the one that won't + * fit in the node. + * + * We aren't in the middle of a multi-char fold unless the + * final character in the node can appear in a non-final + * position in such a fold. Very few characters actually + * participate in multi-character folds, and fewer still can be + * in the non-final position. But it's complicated to know + * here if that final character is folded or not, so skip this + * check */ + + /* Make sure enough space for final char of node, + * first char of following node, and the fold of the + * following char (so we don't have to worry about + * that fold running off the end */ + U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1]; + STRLEN fold_len; + UV folded; + char * const sav_oldp = oldp; + + assert(FOLD); + + /* The Unicode standard says that multi character folds consist + * of either two or three characters. So we create a buffer + * containing a window of three. The first is the final + * character in the node (folded), and then the two that begin + * the following node. But if the first character of the + * following node can't be in a non-final fold position, there + * is no need to look at its successor character. The macros + * used below to check for multi character folds require folded + * inputs, so we have to fold these. (The fold of p was likely + * calculated in the loop above, but it hasn't beeen saved, and + * khw thinks it would be too entangled to change to do so) */ + + if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) { + folded = _to_uni_fold_flags(ender, + foldbuf, + &fold_len, + FOLD_FLAGS_FULL); + } else { + foldbuf[0] = folded = MICRO_SIGN; + fold_len = 1; + } + + /* Here, foldbuf contains the fold of the first character in + * the next node. We may also need the next one (if there is + * one) to get our third, but if the first character folded to + * more than one, those extra one(s) will serve as the third. + * Also, we don't need a third unless the previous one can + * appear in a non-final position in a fold */ + if ( ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1)) + && (fold_len == 1 || ( UTF + && UVCHR_SKIP(folded) == fold_len)) + && UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded))) + { + if (UTF) { + STRLEN next_fold_len; + + toFOLD_utf8_safe((U8*) p + UTF8SKIP(p), + (U8*) RExC_end, foldbuf + fold_len, + &next_fold_len); + fold_len += next_fold_len; + } + else { + if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) { + foldbuf[fold_len] = 's'; + } + else { + foldbuf[fold_len] = toLOWER_L1(p[1]); + } + fold_len++; + } + } - /* Point to the first byte of the final character */ - s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); + /* Here foldbuf contains the the fold of p, and if appropriate + * that of the character following p in the input. */ - while (s >= s0) { /* Search backwards until find - a non-problematic char */ - if (UTF8_IS_INVARIANT(*s)) { + /* Search backwards until find a place that doesn't split a + * multi-char fold */ + while (1) { + STRLEN s_len; + char s_fold_buf[UTF8_MAXBYTES_CASE]; + char * s_fold = s_fold_buf; - /* There are no ascii characters that participate - * in multi-char folds under /aa. In EBCDIC, the - * non-ascii invariants are all control characters, - * so don't ever participate in any folds. */ - if (ASCII_FOLD_RESTRICTED - || ! IS_NON_FINAL_FOLD(*s)) - { - break; - } + if (s <= s0) { + + /* There's no safe place in the node to split. Quit so + * will take the whole node */ + oldp = sav_oldp; + break; + } + + /* Backup 1 character. The first time through this moves s + * to point to the final character in the node */ + if (UTF) { + s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); + } + else { + s--; + } + + /* 's' may or may not be folded; so make sure it is, and + * use just the final character in its fold (should there + * be more than one */ + if (UTF) { + toFOLD_utf8_safe((U8*) s, + (U8*) s + UTF8SKIP(s), + (U8 *) s_fold_buf, &s_len); + while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len) + { + s_fold += UTF8SKIP(s_fold); } - else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( - *s, *(s+1)))) - { + s_len = UTF8SKIP(s_fold); + } + else { + if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S)) + { + s_fold_buf[0] = 's'; + } + else { /* This works for all other non-UTF-8 folds + */ + s_fold_buf[0] = toLOWER_L1(UCHARAT(s)); + } + s_len = 1; + } + + /* Unshift this character to the beginning of the buffer, + * No longer needed trailing characters are overwritten. + * */ + Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8); + Copy(s_fold, foldbuf, s_len, U8); + + /* If this isn't a multi-character fold, we have found a + * splittable place. If this is the final character in the + * node, that means the node is valid as-is, and can quit. + * Otherwise, we note how much we can fill the node before + * coming to a non-splittable position, and go parse it + * again, stopping there. This is done because we know + * where in the output to stop, but we don't have a map to + * where that is in the input. One could be created, but + * it seems like overkill for such a rare event as we are + * dealing with here */ + if (UTF) { + if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf, + foldbuf + UTF8_MAXBYTES_CASE)) + { + upper_fill = s + UTF8SKIP(s) - s0; + if (LIKELY(oldp)) { break; } + goto reparse; } - else if (! _invlist_contains_cp( - PL_NonFinalFold, - valid_utf8_to_uvchr((U8 *) s, NULL))) - { + } + else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf, + foldbuf + UTF8_MAXBYTES_CASE)) + { + upper_fill = s + 1 - s0; + if (LIKELY(oldp)) { break; } - - /* Here, the current character is problematic in that - * it does occur in the non-final position of some - * fold, so try the character before it, but have to - * special case the very first byte in the string, so - * we don't read outside the string */ - s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); - backup_count++; - } /* End of loop backwards through the string */ - - /* If there were only problematic characters in the string, - * will point to before s0, in which case the length - * should be 0, otherwise include the length of the - * non-problematic character just found */ - len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); - } - - /* Here, have found the final character, if any, that is - * non-problematic as far as ending the node without splitting - * it across a potential multi-char fold. contains the - * number of bytes in the node up-to and including that - * character, or is 0 if there is no such character, meaning - * the whole node contains only problematic characters. In - * this case, give up and just take the node as-is. We can't - * do any better */ - if (len == 0) { - len = full_len; - - } else { - - /* Here, the node does contain some characters that aren't - * problematic. If we didn't have to backup any, then the - * final character in the node is non-problematic, and we - * can take the node as-is */ - if (backup_count == 0) { - goto loopdone; + goto reparse; } - else if (backup_count == 1) { - /* If the final character is problematic, but the - * penultimate is not, back-off that last character to - * later start a new node with it */ - p = oldp; - goto loopdone; - } + oldp = old_oldp; + old_oldp = NULL; + + } /* End of loop backing up through the node */ + /* Here the node consists entirely of non-final multi-char + * folds. (Likely it is all 'f's or all 's's.) There's no + * decent place to split it, so give up and just take the + * whole thing */ - /* Here, the final non-problematic character is earlier - * in the input than the penultimate character. What we do - * is reparse from the beginning, going up only as far as - * this final ok one, thus guaranteeing that the node ends - * in an acceptable character. The reason we reparse is - * that we know how far in the character is, but we don't - * know how to correlate its position with the input parse. - * An alternate implementation would be to build that - * correlation as we go along during the original parse, - * but that would entail extra work for every node, whereas - * this code gets executed only when the string is too - * large for the node, and the final two characters are - * problematic, an infrequent occurrence. Yet another - * possible strategy would be to save the tail of the - * string, and the next time regatom is called, initialize - * with that. The problem with this is that unless you - * back off one more character, you won't be guaranteed - * regatom will get called again, unless regbranch, - * regpiece ... are also changed. If you do back off that - * extra character, so that there is input guaranteed to - * force calling regatom, you can't handle the case where - * just the first character in the node is acceptable. I - * (khw) decided to try this method which doesn't have that - * pitfall; if performance issues are found, we can do a - * combination of the current approach plus that one */ - upper_parse = len; - len = 0; - s = s0; - goto reparse; - } } /* End of verifying node ends with an appropriate char */ + p = oldp; + loopdone: /* Jumped to when encounters something that shouldn't be in the node */ /* Free up any over-allocated space; cast is to silence bogus * warning in MS VC */ change_engine_size(pRExC_state, - - (Ptrdiff_t) (initial_size - STR_SZ(len))); + - (Ptrdiff_t) (current_string_nodes - STR_SZ(len))); /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT @@ -14693,15 +14739,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { /* If the node type is EXACT here, check to see if it - * should be EXACTL, or EXACT_ONLY8. */ + * should be EXACTL, or EXACT_REQ8. */ if (node_type == EXACT) { if (LOC) { node_type = EXACTL; } else if (requires_utf8_target) { - node_type = EXACT_ONLY8; + node_type = EXACT_REQ8; + } + } + else if (node_type == LEXACT) { + if (requires_utf8_target) { + node_type = LEXACT_REQ8; } - } else if (FOLD) { + } + else if (FOLD) { if ( UNLIKELY(has_micro_sign || has_ss) && (node_type == EXACTFU || ( node_type == EXACTF && maybe_exactfu))) @@ -14718,9 +14770,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (maybe_exactfu) { node_type = EXACTFLU8; } + else if (UNLIKELY( + _invlist_contains_cp(PL_HasMultiCharFold, ender))) + { + /* A character that folds to more than one will + * match multiple characters, so can't be SIMPLE. + * We don't have to worry about this with EXACTFLU8 + * nodes just above, as they have already been + * folded (since the fold doesn't vary at run + * time). Here, if the final character in the node + * folds to multiple, it can't be simple. (This + * only has an effect if the node has only a single + * character, hence the final one, as elsewhere we + * turn off simple for nodes whose length > 1 */ + maybe_SIMPLE = 0; + } } else if (node_type == EXACTF) { /* Means is /di */ + /* This intermediate variable is needed solely because + * the asserts in the macro where used exceed Win32's + * literal string capacity */ + char first_char = * STRING(REGNODE_p(ret)); + /* If 'maybe_exactfu' is clear, then we need to stay * /di. If it is set, it means there are no code * points that match differently depending on UTF8ness @@ -14729,7 +14801,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (! maybe_exactfu) { RExC_seen_d_op = TRUE; } - else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's') + else if ( isALPHA_FOLD_EQ(first_char, 's') || isALPHA_FOLD_EQ(ender, 's')) { /* But, if the node begins or ends in an 's' we @@ -14749,16 +14821,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } if (requires_utf8_target && node_type == EXACTFU) { - node_type = EXACTFU_ONLY8; + node_type = EXACTFU_REQ8; } } OP(REGNODE_p(ret)) = node_type; - STR_LEN(REGNODE_p(ret)) = len; + setSTR_LEN(REGNODE_p(ret), len); RExC_emit += STR_SZ(len); /* If the node isn't a single character, it can't be SIMPLE */ - if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) { + if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) { maybe_SIMPLE = 0; } @@ -15928,8 +16000,11 @@ redo_curchar: /* Recurse, with the meat of the embedded expression */ RExC_parse++; - (void) handle_regex_sets(pRExC_state, ¤t, flagp, - depth+1, oregcomp_parse); + if (! handle_regex_sets(pRExC_state, ¤t, flagp, + depth+1, oregcomp_parse)) + { + RETURN_FAIL_ON_RESTART(*flagp, flagp); + } /* Here, 'current' contains the embedded expression's * inversion list, and RExC_parse points to the trailing @@ -15983,6 +16058,7 @@ redo_curchar: FALSE, /* Require return to be an ANYOF */ ¤t)) { + RETURN_FAIL_ON_RESTART(*flagp, flagp); goto regclass_failed; } @@ -16019,6 +16095,7 @@ redo_curchar: FALSE, /* Require return to be an ANYOF */ ¤t)) { + RETURN_FAIL_ON_RESTART(*flagp, flagp); goto regclass_failed; } @@ -16379,8 +16456,10 @@ redo_curchar: RExC_flags |= RXf_PMf_FOLD; } - if (!node) + if (!node) { + RETURN_FAIL_ON_RESTART(*flagp, flagp); goto regclass_failed; + } /* Fix up the node type if we are in locale. (We have pretended we are * under /u for the purposes of regclass(), as this construct will only @@ -18117,6 +18196,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Likewise for 'posixes' */ _invlist_union(posixes, cp_list, &cp_list); + SvREFCNT_dec(posixes); /* Likewise for anything else in the range that matched only * under UTF-8 */ @@ -18367,6 +18447,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, goto not_anyof; } } + /* For well-behaved locales, some classes are subsets of others, * so complementing the subset and including the non-complemented * superset should match everything, like [\D[:alnum:]], and @@ -18471,7 +18552,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Next see if can optimize classes that contain just a few code points * into an EXACTish node. The reason to do this is to let the - * optimizer join this node with adjacent EXACTish ones. + * optimizer join this node with adjacent EXACTish ones, and ANYOF + * nodes require conversion to code point from UTF-8. * * An EXACTFish node can be generated even if not under /i, and vice * versa. But care must be taken. An EXACTFish node has to be such @@ -18519,7 +18601,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, op = (FOLD) ? EXACTFL : EXACTL; } else if (! FOLD) { /* Not /l and not /i */ - op = (start[0] < 256) ? EXACT : EXACT_ONLY8; + op = (start[0] < 256) ? EXACT : EXACT_REQ8; } else if (start[0] < 256) { /* /i, not /l, and the code point is small */ @@ -18549,8 +18631,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, applies to it */ op = _invlist_contains_cp(PL_InMultiCharFold, start[0]) - ? EXACTFU_ONLY8 - : EXACT_ONLY8; + ? EXACTFU_REQ8 + : EXACT_REQ8; } value = start[0]; @@ -18714,7 +18796,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ? EXACTFLU8 : (ASCII_FOLD_RESTRICTED) ? EXACTFAA - : EXACTFU_ONLY8; + : EXACTFU_REQ8; value = folded; } } /* Below, the lowest code point < 256 */ @@ -18796,7 +18878,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = regnode_guts(pRExC_state, op, len, "exact"); FILL_NODE(ret, op); RExC_emit += 1 + STR_SZ(len); - STR_LEN(REGNODE_p(ret)) = len; + setSTR_LEN(REGNODE_p(ret), len); if (len == 1) { *STRING(REGNODE_p(ret)) = (U8) value; } @@ -18847,7 +18929,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (invlist_highest(cp_list) <= max_permissible) { UV this_start, this_end; - UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */ + UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */ U8 bits_differing = 0; Size_t full_cp_count = 0; bool first_time = TRUE; @@ -18924,6 +19006,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (op != END) { goto not_anyof; } + + /* XXX We could create an ANYOFR_LOW node here if we saved above if + * all were invariants, it wasn't inverted, and there is a single + * range. This would be faster than some of the posix nodes we + * create below like /\d/a, but would be twice the size. Without + * having actually measured the gain, khw doesn't think the + * tradeoff is really worth it */ } if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) { @@ -19062,7 +19151,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* We store the lowest possible first byte of the UTF-8 * representation, using the flags field. This allows for quick * ruling out of some inputs without having to convert from UTF-8 - * to code point. For EBCDIC, this has to be I8. */ + * to code point. For EBCDIC, we use I8, as not doing that + * transformation would not rule out nearly so many things */ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]); /* If the first UTF-8 start byte for the highest code point in the @@ -19601,8 +19691,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) STATIC void S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) { - /* 'size' is the delta to add or subtract from the current memory allocated - * to the regex engine being constructed */ + /* 'size' is the delta number of smallest regnode equivalents to add or + * subtract from the current memory allocated to the regex engine being + * constructed. */ PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; @@ -19634,8 +19725,8 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) { - /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns - * and increments RExC_size and RExC_emit + /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode + * equivalents space. It aligns and increments RExC_size and RExC_emit * * It returns the regnode's offset into the regex engine program */ @@ -19944,15 +20035,17 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, #endif if ( exact ) { switch (OP(REGNODE_p(scan))) { + case LEXACT: case EXACT: - case EXACT_ONLY8: + case LEXACT_REQ8: + case EXACT_REQ8: case EXACTL: case EXACTF: case EXACTFU_S_EDGE: case EXACTFAA_NO_TRIE: case EXACTFAA: case EXACTFU: - case EXACTFU_ONLY8: + case EXACTFU_REQ8: case EXACTFLU8: case EXACTFUP: case EXACTFL: @@ -21520,9 +21613,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) /* As a final resort, output the range or subrange as hex. */ - this_end = (end < NUM_ANYOF_CODE_POINTS) - ? end - : NUM_ANYOF_CODE_POINTS - 1; + if (start >= NUM_ANYOF_CODE_POINTS) { + this_end = end; + } + else { /* Have to split range at the bitmap boundary */ + this_end = (end < NUM_ANYOF_CODE_POINTS) + ? end + : NUM_ANYOF_CODE_POINTS - 1; + } #if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) ? "\\x%02" UVXf "-\\x%02" UVXf @@ -22912,7 +23010,7 @@ Perl_parse_uniprop_string(pTHX_ Titlecase Mapping (both full and simple) Uppercase Mapping (both full and simple) * Move the part that looks at the property values into a perl - * script, like utf8_heavy.pl is done. This makes things somewhat + * script, like utf8_heavy.pl was done. This makes things somewhat * easier, but most importantly, it avoids always adding all these * strings to the memory usage when the feature is little-used. *