X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1a7108fea1f14e60c0d69b29980183cf21288bef..6e0d5e3a8141f6b1419c181803856513ce75669e:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 9782f19..f77e5ca 100644 --- a/regcomp.c +++ b/regcomp.c @@ -182,11 +182,10 @@ struct RExC_state_t { through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; + I32 in_lookahead; I32 contains_locale; I32 override_recoding; -#ifdef EBCDIC - I32 recode_x_to_native; -#endif + I32 recode_x_to_native; I32 in_multi_char_class; struct reg_code_blocks *code_blocks;/* positions of literal (?{}) within pattern */ @@ -244,7 +243,6 @@ struct RExC_state_t { #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs under /d from /u ? */ - #ifdef RE_TRACK_PATTERN_OFFSETS # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the others */ @@ -273,10 +271,17 @@ struct RExC_state_t { #define RExC_study_chunk_recursed_bytes \ (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_in_lookahead (pRExC_state->in_lookahead) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) + #ifdef EBCDIC -# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) +# define SET_recode_x_to_native(x) \ + STMT_START { RExC_recode_x_to_native = (x); } STMT_END +#else +# define SET_recode_x_to_native(x) NOOP #endif + #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define RExC_frame_head (pRExC_state->frame_head) #define RExC_frame_last (pRExC_state->frame_last) @@ -352,7 +357,7 @@ struct RExC_state_t { /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is * a flag that indicates we need to override /d with /u as a result of * something in the pattern. It should only be used in regards to calling - * set_regex_charset() or get_regex_charse() */ + * set_regex_charset() or get_regex_charset() */ #define REQUIRE_UNI_RULES(flagp, restart_retval) \ STMT_START { \ if (DEPENDS_SEMANTICS) { \ @@ -372,12 +377,8 @@ struct RExC_state_t { #define REQUIRE_BRANCHJ(flagp, restart_retval) \ STMT_START { \ RExC_use_BRANCHJ = 1; \ - if (LIKELY(! IN_PARENS_PASS)) { \ - /* No need to restart the parse immediately if we're \ - * going to reparse anyway to count parens */ \ - *flagp |= RESTART_PARSE; \ - return restart_retval; \ - } \ + *flagp |= RESTART_PARSE; \ + return restart_retval; \ } STMT_END /* Until we have completed the parse, we leave RExC_total_parens at 0 or @@ -748,6 +749,10 @@ static const scan_data_t zero_scan_data = { Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) +#define FAIL3(msg,arg1,arg2) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ + arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan */ @@ -845,7 +850,8 @@ static const scan_data_t zero_scan_data = { #define UPDATE_WARNINGS_LOC(loc) \ STMT_START { \ if (TO_OUTPUT_WARNINGS(loc)) { \ - RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \ + RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ + - RExC_precomp; \ } \ } STMT_END @@ -1582,7 +1588,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, unsigned int i; const U32 n = ARG(node); bool new_node_has_latin1 = FALSE; - const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr)) + const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb)) ? 0 : ANYOF_FLAGS(node); @@ -1637,7 +1643,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } /* Add in the points from the bit map */ - if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) { + if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) { for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { if (ANYOF_BITMAP_TEST(node, i)) { unsigned int start = i++; @@ -1724,7 +1730,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * another SSC or a regular ANYOF class. Can create false positives. */ SV* anded_cp_list; - U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr) + U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb) ? 0 : ANYOF_FLAGS(and_with); U8 anded_flags; @@ -1910,7 +1916,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, SV* ored_cp_list; U8 ored_flags; - U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr) + U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb) ? 0 : ANYOF_FLAGS(or_with); @@ -2136,6 +2142,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); + SvREFCNT_dec(invlist); /* Make sure is clone-safe */ ssc->invlist = NULL; @@ -2526,7 +2533,8 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(UTF8_MAXBYTES); \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + *kapow = '\0'; \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -2695,7 +2703,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, #endif switch (flags) { - case EXACT: case EXACT_ONLY8: case EXACTL: break; + case EXACT: case EXACT_REQ8: case EXACTL: break; case EXACTFAA: case EXACTFUP: case EXACTFU: @@ -2710,7 +2718,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL) + if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -2786,8 +2794,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( noper < tail && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_ONLY8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 || OP(noper) == EXACTFUP)))) { uc= (U8*)STRING(noper); @@ -3004,8 +3012,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( noper < tail && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_ONLY8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 || OP(noper) == EXACTFUP)))) { const U8 *uc= (U8*)STRING(noper); @@ -3229,8 +3237,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( noper < tail && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_ONLY8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 || OP(noper) == EXACTFUP)))) { const U8 *uc= (U8*)STRING(noper); @@ -3545,9 +3553,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( state==1 ) { OP( convert ) = nodetype; str=STRING(convert); - STR_LEN(convert)=0; + setSTR_LEN(convert, 0); } - STR_LEN(convert) += len; + setSTR_LEN(convert, STR_LEN(convert) + len); while (len--) *str++ = *ch++; } else { @@ -3987,8 +3995,9 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * using /iaa matching will be doing so almost entirely with ASCII * strings, so this should rarely be encountered in practice */ -#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ - if (PL_regkind[OP(scan)] == EXACT) \ +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ + if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \ + && OP(scan) != LEXACT_REQ8) \ join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1) STATIC U32 @@ -4053,16 +4062,16 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, /* Joining something that requires UTF-8 with something that * doesn't, means the result requires UTF-8. */ - if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) { - OP(scan) = EXACT_ONLY8; + if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { + OP(scan) = EXACT_REQ8; } - else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) { + else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { ; /* join is compatible, no need to change OP */ } - else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) { - OP(scan) = EXACTFU_ONLY8; + else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { + OP(scan) = EXACTFU_REQ8; } - else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) { + else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { ; /* join is compatible, no need to change OP */ } else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { @@ -4085,7 +4094,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * node. And if the only adjacent node is EXACTF, they get * absorbed into that, under the theory that a longer node is * better than two shorter ones, even if one is EXACTFU. Note - * that EXACTFU_ONLY8 is generated only for UTF-8 patterns, + * that EXACTFU_REQ8 is generated only for UTF-8 patterns, * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ if (STRING(n)[STR_LEN(n)-1] == 's') { @@ -4154,7 +4163,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, merged++; NEXT_OFF(scan) += NEXT_OFF(n); - STR_LEN(scan) += STR_LEN(n); + setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n)); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); @@ -4193,7 +4202,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this final joining, sequences could have been split over boundaries, and * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ - if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) { + if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { U8* s0 = (U8*) STRING(scan); U8* s = s0; U8* s_end = s0 + STR_LEN(scan); @@ -4536,31 +4545,31 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, */ JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); - /* Follow the next-chain of the current node and optimize - away all the NOTHINGs from it. */ - if (OP(scan) != CURLYX) { - const int max = (reg_off_by_arg[OP(scan)] - ? I32_MAX - /* I32 may be smaller than U16 on CRAYs! */ - : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); - int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); - int noff; - regnode *n = scan; - - /* Skip NOTHING and LONGJMP. */ - while ((n = regnext(n)) - && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) - || ((OP(n) == LONGJMP) && (noff = ARG(n)))) - && off + noff < max) - off += noff; - if (reg_off_by_arg[OP(scan)]) - ARG(scan) = off; - else - NEXT_OFF(scan) = off; - } + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ( (n = regnext(n)) + && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } - /* The principal pseudo-switch. Cannot be a switch, since we - look into several different things. */ + /* The principal pseudo-switch. Cannot be a switch, since we look into + * several different things. */ if ( OP(scan) == DEFINEP ) { SSize_t minlen = 0; SSize_t deltanext = 0; @@ -4774,7 +4783,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( SvIV(re_trie_maxbuff)>=0 ) { regnode *cur; regnode *first = (regnode *)NULL; - regnode *last = (regnode *)NULL; + regnode *prev = (regnode *)NULL; regnode *tail = scan; U8 trietype = 0; U32 count=0; @@ -4852,9 +4861,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ----------------+----------- NOTHING | NOTHING EXACT | EXACT - EXACT_ONLY8 | EXACT + EXACT_REQ8 | EXACT EXACTFU | EXACTFU - EXACTFU_ONLY8 | EXACTFU + EXACTFU_REQ8 | EXACTFU EXACTFUP | EXACTFU EXACTFAA | EXACTFAA EXACTL | EXACTL @@ -4864,10 +4873,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ ? NOTHING \ - : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \ + : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ ? EXACT \ : ( EXACTFU == (X) \ - || EXACTFU_ONLY8 == (X) \ + || EXACTFU_REQ8 == (X) \ || EXACTFUP == (X) ) \ ? EXACTFU \ : ( EXACTFAA == (X) ) \ @@ -4905,7 +4914,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); } Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", - REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] ); }); @@ -4951,7 +4960,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if ( trietype == NOTHING ) trietype = noper_trietype; - last = cur; + prev = cur; } if (first) count++; @@ -4961,7 +4970,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * noper may either be a triable node which can * not be tried together with the current trie, * or a non triable node */ - if ( last ) { + if ( prev ) { /* If last is set and trietype is not * NOTHING then we have found at least two * triable branch sequences in a row of a @@ -4974,7 +4983,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, make_trie( pRExC_state, startbranch, first, cur, tail, count, trietype, depth+1 ); - last = NULL; /* note: we clear/update + prev = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ } @@ -5003,12 +5012,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, Perl_re_indentf( aTHX_ "- %s (%d) ", depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", - REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), PL_reg_name[trietype] ); }); - if ( last && trietype ) { + if ( prev && trietype ) { if ( trietype != NOTHING ) { /* the last branch of the sequence was part of * a trie, so we have to construct it here @@ -5053,9 +5062,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, OP(opt)= OPTIMIZED; } } - } /* end if ( last) */ + } /* end if ( prev) */ } /* TRIE_MAXBUF is non zero */ - } /* do trie */ } @@ -5191,7 +5199,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else if ( OP(scan) == EXACT - || OP(scan) == EXACT_ONLY8 + || OP(scan) == LEXACT + || OP(scan) == EXACT_REQ8 + || OP(scan) == LEXACT_REQ8 || OP(scan) == EXACTL) { SSize_t l = STR_LEN(scan); @@ -5276,7 +5286,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (flags & SCF_DO_STCLASS) { - SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan); + SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); assert(EXACTF_invlist); if (flags & SCF_DO_STCLASS_AND) { @@ -5313,7 +5323,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); if ( OP(next) == EXACT - || OP(next) == EXACT_ONLY8 + || OP(next) == LEXACT + || OP(next) == EXACT_REQ8 + || OP(next) == LEXACT_REQ8 || OP(next) == EXACTL || (flags & SCF_DO_STCLASS)) { @@ -5852,6 +5864,7 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", case ANYOFH: case ANYOFHb: case ANYOFHr: + case ANYOFHs: case ANYOF: if (flags & SCF_DO_STCLASS_AND) ssc_and(pRExC_state, data->start_class, @@ -5877,6 +5890,26 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", break; } + case ANYOFR: + case ANYOFRb: + { + SV* cp_list = NULL; + + cp_list = _add_range_to_invlist(cp_list, + ANYOFRbase(scan), + ANYOFRbase(scan) + ANYOFRdelta(scan)); + + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, cp_list, invert); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, cp_list, invert); + } + + SvREFCNT_dec_NN(cp_list); + break; + } + case NPOSIXL: invert = 1; /* FALLTHROUGH */ @@ -6165,7 +6198,6 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", } #endif } - else if (OP(scan) == OPEN) { if (stopparen != (I32)ARG(scan)) pars++; @@ -7387,28 +7419,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_r(if (!PL_colorset) reginitcolors()); - /* Initialize these here instead of as-needed, as is quick and avoids - * having to test them each time otherwise */ - if (! PL_InBitmap) { -#ifdef DEBUGGING - char * dump_len_string; -#endif - - /* This is calculated here, because the Perl program that generates the - * static global ones doesn't currently have access to - * NUM_ANYOF_CODE_POINTS */ - PL_InBitmap = _new_invlist(2); - PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, - NUM_ANYOF_CODE_POINTS - 1); -#ifdef DEBUGGING - dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); - if ( ! dump_len_string - || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) - { - PL_dump_re_max_len = 60; /* A reasonable default */ - } -#endif - } pRExC_state->warn_text = NULL; pRExC_state->unlexed_names = NULL; @@ -7578,6 +7588,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && memEQ(RX_PRECOMP(old_re), exp, plen) && !runtime_code /* with runtime code, always recompile */ ) { + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len); + Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n", + PL_colors[4], PL_colors[5], s); + }); return old_re; } @@ -7621,10 +7637,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen = 0; RExC_maxlen = 0; RExC_in_lookbehind = 0; + RExC_in_lookahead = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; -#ifdef EBCDIC RExC_recode_x_to_native = 0; -#endif RExC_in_multi_char_class = 0; RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; @@ -7835,6 +7850,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SetProgLen(RExC_rxi,RExC_size); #endif + DEBUG_DUMP_PRE_OPTIMIZE_r({ + SV * const sv = sv_newmortal(); + RXi_GET_DECL(RExC_rx, ri); + DEBUG_RExC_seen(); + Perl_re_printf( aTHX_ "Program before optimization:\n"); + + (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL, + sv, 0, 0); + }); + DEBUG_OPTIMISE_r( Perl_re_printf( aTHX_ "Starting post parse optimization\n"); ); @@ -7957,7 +7982,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Ignore EXACT as we deal with it later. */ if (PL_regkind[OP(first)] == EXACT) { if ( OP(first) == EXACT - || OP(first) == EXACT_ONLY8 + || OP(first) == LEXACT + || OP(first) == EXACT_REQ8 + || OP(first) == LEXACT_REQ8 || OP(first) == EXACTL) { NOOP; /* Empty, get anchored substr later. */ @@ -8303,7 +8330,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && nop == END) RExC_rx->extflags |= RXf_WHITE; else if ( RExC_rx->extflags & RXf_SPLIT - && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL) + && ( fop == EXACT || fop == LEXACT + || fop == EXACT_REQ8 || fop == LEXACT_REQ8 + || fop == EXACTL) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && nop == END ) @@ -8647,8 +8676,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, i = rx->sublen + rx->suboffset - rx->offs[0].end; } else - if ( 0 <= n && n <= (I32)rx->nparens && - (s1 = rx->offs[n].start) != -1 && + if (inRANGE(n, 0, (I32)rx->nparens) && + (s1 = rx->offs[n].start) != -1 && (t1 = rx->offs[n].end) != -1) { /* $&, ${^MATCH}, $1 ... */ @@ -9001,23 +9030,6 @@ S__invlist_array_init(SV* const invlist, const bool will_have_0) return zero_addr + *offset; } -PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) -{ - /* Sets the current number of elements stored in the inversion list. - * Updates SvCUR correspondingly */ - PERL_UNUSED_CONTEXT; - PERL_ARGS_ASSERT_INVLIST_SET_LEN; - - assert(is_invlist(invlist)); - - SvCUR_set(invlist, - (len == 0) - ? 0 - : TO_INTERNAL_SIZE(len + offset)); - assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); -} - STATIC void S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) { @@ -9168,6 +9180,7 @@ S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) invlist_iterfinish(invlist); *get_invlist_previous_index_addr(invlist) = 0; + SvPOK_on(invlist); /* This allows B to extract the PV */ } SV* @@ -9242,25 +9255,12 @@ Perl__new_invlist_C_array(pTHX_ const UV* const list) invlist_iterfinish(invlist); SvREADONLY_on(invlist); + SvPOK_on(invlist); return invlist; } STATIC void -S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) -{ - /* Grow the maximum size of an inversion list */ - - PERL_ARGS_ASSERT_INVLIST_EXTEND; - - assert(is_invlist(invlist)); - - /* Add one to account for the zero element at the beginning which may not - * be counted by the calling parameters */ - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); -} - -STATIC void S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) { @@ -10246,11 +10246,6 @@ Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, #endif -PERL_STATIC_INLINE SV* -S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { - return _add_range_to_invlist(invlist, cp, cp); -} - #ifndef PERL_IN_XSUB_RE void Perl__invlist_invert(pTHX_ SV* const invlist) @@ -10301,106 +10296,26 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) #endif -PERL_STATIC_INLINE STRLEN* -S_get_invlist_iter_addr(SV* invlist) -{ - /* Return the address of the UV that contains the current iteration - * position */ - - PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - - assert(is_invlist(invlist)); - - return &(((XINVLIST*) SvANY(invlist))->iterator); -} - -PERL_STATIC_INLINE void -S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ -{ - PERL_ARGS_ASSERT_INVLIST_ITERINIT; - - *get_invlist_iter_addr(invlist) = 0; -} - -PERL_STATIC_INLINE void -S_invlist_iterfinish(SV* invlist) -{ - /* Terminate iterator for invlist. This is to catch development errors. - * Any iteration that is interrupted before completed should call this - * function. Functions that add code points anywhere else but to the end - * of an inversion list assert that they are not in the middle of an - * iteration. If they were, the addition would make the iteration - * problematical: if the iteration hadn't reached the place where things - * were being added, it would be ok */ - - PERL_ARGS_ASSERT_INVLIST_ITERFINISH; - - *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; -} - -STATIC bool -S_invlist_iternext(SV* invlist, UV* start, UV* end) -{ - /* An C call on must be used to set this up. - * This call sets in <*start> and <*end>, the next range in . - * Returns if successful and the next call will return the next - * range; if was already at the end of the list. If the latter, - * <*start> and <*end> are unchanged, and the next call to this function - * will start over at the beginning of the list */ - - STRLEN* pos = get_invlist_iter_addr(invlist); - UV len = _invlist_len(invlist); - UV *array; - - PERL_ARGS_ASSERT_INVLIST_ITERNEXT; - - if (*pos >= len) { - *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ - return FALSE; - } - - array = invlist_array(invlist); - - *start = array[(*pos)++]; - - if (*pos >= len) { - *end = UV_MAX; - } - else { - *end = array[(*pos)++] - 1; - } - - return TRUE; -} - PERL_STATIC_INLINE UV -S_invlist_highest(SV* const invlist) +S_invlist_lowest(SV* const invlist) { - /* Returns the highest code point that matches an inversion list. This API - * has an ambiguity, as it returns 0 under either the highest is actually + /* Returns the lowest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the lowest is actually * 0, or if the list is empty. If this distinction matters to you, check * for emptiness before calling this function */ UV len = _invlist_len(invlist); UV *array; - PERL_ARGS_ASSERT_INVLIST_HIGHEST; + PERL_ARGS_ASSERT_INVLIST_LOWEST; if (len == 0) { - return 0; + return 0; } array = invlist_array(invlist); - /* The last element in the array in the inversion list always starts a - * range that goes to infinity. That range may be for code points that are - * matched in the inversion list, or it may be for ones that aren't - * matched. In the latter case, the highest code point in the set is one - * less than the beginning of this range; otherwise it is the final element - * of this range: infinity */ - return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) - ? UV_MAX - : array[len - 1] - 1; + return array[0]; } STATIC SV * @@ -10581,7 +10496,7 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) * call SvREFCNT_dec() when done with it. */ STATIC SV* -S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) +S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) { dVAR; const U8 * s = (U8*)STRING(node); @@ -10590,7 +10505,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) /* Start out big enough for 2 separate code points */ SV* invlist = _new_invlist(4); - PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST; + PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; if (! UTF) { uc = *s; @@ -11064,7 +10979,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REG; DEBUG_PARSE("reg "); - max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD); assert(max_open); if (!SvIOK(max_open)) { @@ -11077,6 +10991,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) *flagp = 0; /* Tentatively. */ + if (RExC_in_lookbehind) { + RExC_in_lookbehind++; + } + if (RExC_in_lookahead) { + RExC_in_lookahead++; + } + /* Having this true makes it feasible to have a lot fewer tests for the * parse pointer being in scope. For example, we can write * while(isFOO(*RExC_parse)) RExC_parse++; @@ -11290,12 +11211,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) goto parse_rest; } - /* By doing this here, we avoid extra warnings for nested - * script runs */ - ckWARNexperimental(RExC_parse, - WARN_EXPERIMENTAL__SCRIPT_RUN, - "The script_run feature is experimental"); - if (paren == 's') { /* Here, we're starting a new regular script run */ ret = reg_node(pRExC_state, SROPEN); @@ -11318,10 +11233,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) return 0; } - REGTAIL(pRExC_state, ret, atomic); + if (! REGTAIL(pRExC_state, ret, atomic)) { + REQUIRE_BRANCHJ(flagp, 0); + } - REGTAIL(pRExC_state, atomic, - reg_node(pRExC_state, SRCLOSE)); + if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state, + SRCLOSE))) + { + REQUIRE_BRANCHJ(flagp, 0); + } RExC_in_script_run = 0; return ret; @@ -11335,9 +11255,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /*FALLTHROUGH*/ alpha_assertions: - ckWARNexperimental(RExC_parse, - WARN_EXPERIMENTAL__ALPHA_ASSERTIONS, - "The alpha_assertions feature is experimental"); RExC_seen_zerolen++; @@ -11541,10 +11458,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_parse >= RExC_end) { vFAIL("Sequence (?... not terminated"); } - - /* FALLTHROUGH */ + RExC_seen_zerolen++; + break; case '=': /* (?=...) */ RExC_seen_zerolen++; + RExC_in_lookahead++; break; case '!': /* (?!...) */ RExC_seen_zerolen++; @@ -11780,7 +11698,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_flags & RXf_PMf_COMPILETIME ); FLAGS(REGNODE_p(ret)) = 2; - REGTAIL(pRExC_state, ret, eval); + if (! REGTAIL(pRExC_state, ret, eval)) { + REQUIRE_BRANCHJ(flagp, 0); + } /* deal with the length of this later - MJD */ return ret; } @@ -11833,7 +11753,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) tail = reg(pRExC_state, 1, &flag, depth+1); RETURN_FAIL_ON_RESTART(flag, flagp); - REGTAIL(pRExC_state, ret, tail); + if (! REGTAIL(pRExC_state, ret, tail)) { + REQUIRE_BRANCHJ(flagp, 0); + } goto insert_if; } else if ( RExC_parse[0] == '<' /* (?()...) */ @@ -11925,15 +11847,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } nextchar(pRExC_state); insert_if: - REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state, + IFTHEN, 0))) + { + REQUIRE_BRANCHJ(flagp, 0); + } br = regbranch(pRExC_state, &flags, 1, depth+1); if (br == 0) { RETURN_FAIL_ON_RESTART(flags,flagp); FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, - LONGJMP, 0)); + if (! REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0))) + { + REQUIRE_BRANCHJ(flagp, 0); + } c = UCHARAT(RExC_parse); nextchar(pRExC_state); if (flags&HASWIDTH) @@ -11950,7 +11879,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); } - REGTAIL(pRExC_state, ret, lastbr); + if (! REGTAIL(pRExC_state, ret, lastbr)) { + REQUIRE_BRANCHJ(flagp, 0); + } if (flags&HASWIDTH) *flagp |= HASWIDTH; c = UCHARAT(RExC_parse); @@ -11965,16 +11896,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Switch (?(condition)... contains too many branches"); } ender = reg_node(pRExC_state, TAIL); - REGTAIL(pRExC_state, br, ender); + if (! REGTAIL(pRExC_state, br, ender)) { + REQUIRE_BRANCHJ(flagp, 0); + } if (lastbr) { - REGTAIL(pRExC_state, lastbr, ender); - REGTAIL(pRExC_state, REGNODE_OFFSET( - NEXTOPER( - NEXTOPER(REGNODE_p(lastbr)))), - ender); + if (! REGTAIL(pRExC_state, lastbr, ender)) { + REQUIRE_BRANCHJ(flagp, 0); + } + if (! REGTAIL(pRExC_state, + REGNODE_OFFSET( + NEXTOPER( + NEXTOPER(REGNODE_p(lastbr)))), + ender)) + { + REQUIRE_BRANCHJ(flagp, 0); + } } else - REGTAIL(pRExC_state, ret, ender); + if (! REGTAIL(pRExC_state, ret, ender)) { + REQUIRE_BRANCHJ(flagp, 0); + } #if 0 /* Removing this doesn't cause failures in the test suite -- khw */ RExC_size++; /* XXX WHY do we need this?!! For large programs it seems to be required @@ -12016,16 +11957,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) goto parse_rest; } /* end switch */ } - else { - if (*RExC_parse == '{') { - ckWARNregdep(RExC_parse + 1, - "Unescaped left brace in regex is " - "deprecated here (and will be fatal " - "in Perl 5.32), passed through"); - } - /* Not bothering to indent here, as the above 'else' is temporary - * */ - if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ + else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ capturing_parens: parno = RExC_npar; RExC_npar++; @@ -12092,7 +12024,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ paren = ':'; ret = 0; - } } } else /* ! paren */ @@ -12124,7 +12055,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) *flagp |= flags&SIMPLE; } if (is_open) { /* Starts with OPEN. */ - REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ + if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */ + REQUIRE_BRANCHJ(flagp, 0); + } } else if (paren != '?') /* Not Conditional */ ret = br; @@ -12132,12 +12065,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) lastbr = br; while (*RExC_parse == '|') { if (RExC_use_BRANCHJ) { + bool shut_gcc_up; + ender = reganode(pRExC_state, LONGJMP, 0); /* Append to the previous. */ - REGTAIL(pRExC_state, - REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))), - ender); + shut_gcc_up = REGTAIL(pRExC_state, + REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))), + ender); + PERL_UNUSED_VAR(shut_gcc_up); } nextchar(pRExC_state); if (freeze_paren) { @@ -12248,9 +12184,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) is_nothing= 0; } else if (op == BRANCHJ) { - REGTAIL_STUDY(pRExC_state, - REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))), - ender); + bool shut_gcc_up = REGTAIL_STUDY(pRExC_state, + REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))), + ender); + PERL_UNUSED_VAR(shut_gcc_up); /* for now we always disable this optimisation * / if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender)) @@ -12343,6 +12280,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_in_lookbehind) { RExC_in_lookbehind--; } + if (RExC_in_lookahead) { + RExC_in_lookahead--; + } if (after_freeze > RExC_npar) RExC_npar = after_freeze; return(ret); @@ -12562,7 +12502,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) const regnode_offset w = reg_node(pRExC_state, WHILEM); FLAGS(REGNODE_p(w)) = 0; - REGTAIL(pRExC_state, ret, w); + if (! REGTAIL(pRExC_state, ret, w)) { + REQUIRE_BRANCHJ(flagp, 0); + } if (RExC_use_BRANCHJ) { reginsert(pRExC_state, LONGJMP, ret, depth+1); reginsert(pRExC_state, NOTHING, ret, depth+1); @@ -12577,7 +12519,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (RExC_use_BRANCHJ) NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to LONGJMP. */ - REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); + if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state, + NOTHING))) + { + REQUIRE_BRANCHJ(flagp, 0); + } RExC_whilem_seen++; MARK_NAUGHTY_EXP(1, 4); /* compound interest */ } @@ -12649,16 +12595,22 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (*RExC_parse == '?') { nextchar(pRExC_state); reginsert(pRExC_state, MINMOD, ret, depth+1); - REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); + if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) { + REQUIRE_BRANCHJ(flagp, 0); + } } else if (*RExC_parse == '+') { regnode_offset ender; nextchar(pRExC_state); ender = reg_node(pRExC_state, SUCCEED); - REGTAIL(pRExC_state, ret, ender); + if (! REGTAIL(pRExC_state, ret, ender)) { + REQUIRE_BRANCHJ(flagp, 0); + } reginsert(pRExC_state, SUSPEND, ret, depth+1); ender = reg_node(pRExC_state, TAIL); - REGTAIL(pRExC_state, ret, ender); + if (! REGTAIL(pRExC_state, ret, ender)) { + REQUIRE_BRANCHJ(flagp, 0); + } } if (ISMULT2(RExC_parse)) { @@ -12899,9 +12851,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, value = (U8 *) SvPV(value_sv, value_len); /* See if the result is one code point vs 0 or multiple */ - if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv)) - ? UTF8SKIP(value) - : 1)) + if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv) + ? UTF8SKIP(value) + : 1))) { /* Here, exactly one code point. If that isn't what is wanted, * fail */ @@ -12952,11 +12904,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, sv_catsv(substitute_parse, value_sv); sv_catpv(substitute_parse, ")"); -#ifdef EBCDIC /* The value should already be native, so no need to convert on EBCDIC * platforms.*/ assert(! RExC_recode_x_to_native); -#endif } else { /* \N{U+...} */ @@ -13089,12 +13039,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, sv_catpvs(substitute_parse, ")"); -#ifdef EBCDIC /* The values are Unicode, and therefore have to be converted to native * on a non-Unicode (meaning non-ASCII) platform. */ - RExC_recode_x_to_native = 1; -#endif - + SET_recode_x_to_native(1); } /* Here, we have the string the name evaluates to, ready to be parsed, @@ -13119,9 +13066,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_start = save_start; RExC_parse = endbrace; RExC_end = orig_end; -#ifdef EBCDIC - RExC_recode_x_to_native = 0; -#endif + SET_recode_x_to_native(0); SvREFCNT_dec_NN(substitute_parse); @@ -13299,7 +13244,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; U8 op; int invert = 0; - U8 arg; GET_RE_DEBUG_FLAGS_DECL; @@ -13428,15 +13372,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= SIMPLE; goto finish_meta_pat; case 'K': - RExC_seen_zerolen++; - ret = reg_node(pRExC_state, KEEPS); - *flagp |= SIMPLE; - /* XXX:dmq : disabling in-place substitution seems to - * be necessary here to avoid cases of memory corruption, as - * with: C<$_="x" x 80; s/x\K/y/> -- rgs - */ - RExC_seen |= REG_LOOKBEHIND_SEEN; - goto finish_meta_pat; + if (!RExC_in_lookbehind && !RExC_in_lookahead) { + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_LOOKBEHIND_SEEN; + goto finish_meta_pat; + } + else { + ++RExC_parse; /* advance past the 'K' */ + vFAIL("\\K not permitted in lookahead/lookbehind"); + } case 'Z': ret = reg_node(pRExC_state, SEOL); *flagp |= SIMPLE; @@ -13454,13 +13404,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= HASWIDTH; goto finish_meta_pat; - case 'W': - invert = 1; - /* FALLTHROUGH */ - case 'w': - arg = ANYOF_WORDCHAR; - goto join_posix; - case 'B': invert = 1; /* FALLTHROUGH */ @@ -13579,85 +13522,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto finish_meta_pat; } - case 'D': - invert = 1; - /* FALLTHROUGH */ - case 'd': - arg = ANYOF_DIGIT; - if (! DEPENDS_SEMANTICS) { - goto join_posix; - } - - /* \d doesn't have any matches in the upper Latin1 range, hence /d - * is equivalent to /u. Changing to /u saves some branches at - * runtime */ - op = POSIXU; - goto join_posix_op_known; - case 'R': ret = reg_node(pRExC_state, LNBREAK); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; - case 'H': - invert = 1; - /* FALLTHROUGH */ + case 'd': + case 'D': case 'h': - arg = ANYOF_BLANK; - op = POSIXU; - goto join_posix_op_known; - - case 'V': - invert = 1; - /* FALLTHROUGH */ - case 'v': - arg = ANYOF_VERTWS; - op = POSIXU; - goto join_posix_op_known; - - case 'S': - invert = 1; - /* FALLTHROUGH */ - case 's': - arg = ANYOF_SPACE; - - join_posix: - - op = POSIXD + get_regex_charset(RExC_flags); - if (op > POSIXA) { /* /aa is same as /a */ - op = POSIXA; - } - else if (op == POSIXL) { - RExC_contains_locale = 1; - } - else if (op == POSIXD) { - RExC_seen_d_op = TRUE; - } - - join_posix_op_known: - - if (invert) { - op += NPOSIXD - POSIXD; - } - - ret = reg_node(pRExC_state, op); - FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg); - - *flagp |= HASWIDTH|SIMPLE; - /* FALLTHROUGH */ - - finish_meta_pat: - if ( UCHARAT(RExC_parse + 1) == '{' - && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end))) - { - RExC_parse += 2; - vFAIL("Unescaped left brace in regex is illegal here"); - } - nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(ret), 2); /* MJD */ - break; + case 'H': case 'p': case 'P': + case 's': + case 'S': + case 'v': + case 'V': + case 'w': + case 'W': + /* These all have the same meaning inside [brackets], and it knows + * how to do the best optimizations for them. So, pretend we found + * these within brackets, and let it do the work */ RExC_parse--; ret = regclass(pRExC_state, flagp, depth+1, @@ -13676,10 +13560,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, (UV) *flagp); - RExC_parse--; + RExC_parse--; /* regclass() leaves this one too far ahead */ + finish_meta_pat: + /* The escapes above that don't take a parameter can't be + * followed by a '{'. But 'pX', 'p{foo}' and + * correspondingly 'P' can be */ + if ( RExC_parse - parse_start == 1 + && UCHARAT(RExC_parse + 1) == '{' + && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end))) + { + RExC_parse += 2; + vFAIL("Unescaped left brace in regex is illegal here"); + } Set_Node_Offset(REGNODE_p(ret), parse_start); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2); + Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */ nextchar(pRExC_state); break; case 'N': @@ -13800,7 +13695,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) && num >= RExC_npar /* cannot be an octal escape if it starts with 8 */ && *RExC_parse != '8' - /* cannot be an octal escape it it starts with 9 */ + /* cannot be an octal escape if it starts with 9 */ && *RExC_parse != '9' ) { /* Probably not meant to be a backref, instead likely @@ -13900,14 +13795,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) STRLEN len = 0; UV ender = 0; char *p; - char *s; - -/* This allows us to fill a node with just enough spare so that if the final - * character folds, its expansion is guaranteed to fit */ -#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE) - + char *s, *old_s = NULL, *old_old_s = NULL; char *s0; - U8 upper_parse = MAX_NODE_STRING_SIZE; + U32 max_string_len = 255; + + /* We may have to reparse the node, artificially stopping filling + * it early, based on info gleaned in the first parse. This + * variable gives where we stop. Make it above the normal stopping + * place first time through; otherwise it would stop too early */ + U32 upper_fill = max_string_len + 1; /* We start out as an EXACT node, even if under /i, until we find a * character which is in a fold. The algorithm now segregates into @@ -13921,9 +13817,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) U8 node_type = EXACT; /* Assume the node will be fully used; the excess is given back at - * the end. We can't make any other length assumptions, as a byte - * input sequence could shrink down. */ - Ptrdiff_t initial_size = STR_SZ(256); + * the end. Under /i, we may need to temporarily add the fold of + * an extra character or two at the end to check for splitting + * multi-char folds, so allocate extra space for that. We can't + * make any other length assumptions, as a byte input sequence + * could shrink down. */ + Ptrdiff_t current_string_nodes = STR_SZ(max_string_len + + ((! FOLD) + ? 0 + : 2 * ((UTF) + ? UTF8_MAXBYTES_CASE + /* Max non-UTF-8 expansion is 2 */ : 2))); bool next_is_quantifier; char * oldp = NULL; @@ -13954,10 +13858,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* So is the MICRO SIGN */ bool has_micro_sign = FALSE; + /* Set when we fill up the current node and there is still more + * text to process */ + bool overflowed; + /* Allocate an EXACT node. The node_type may change below to * another EXACTish node, but since the size of the node doesn't * change, it works */ - ret = regnode_guts(pRExC_state, node_type, initial_size, "exact"); + ret = regnode_guts(pRExC_state, node_type, current_string_nodes, + "exact"); FILL_NODE(ret, node_type); RExC_emit++; @@ -13967,6 +13876,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reparse: + p = RExC_parse; + len = 0; + s = s0; + node_type = EXACT; + oldp = NULL; + maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); + maybe_SIMPLE = SIMPLE; + requires_utf8_target = FALSE; + has_ss = FALSE; + has_micro_sign = FALSE; + + continue_parse: + /* This breaks under rare circumstances. If folding, we do not * want to split a node at a character that is a non-final in a * multi-char fold, as an input string could just happen to want to @@ -13981,18 +13903,22 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) || UTF8_IS_START(UCHARAT(RExC_parse))); + overflowed = FALSE; + /* Here, we have a literal character. Find the maximal string of * them in the input that we can fit into a single EXACTish node. * We quit at the first non-literal or when the node gets full, or * under /i the categorization of folding/non-folding character * changes */ - for (p = RExC_parse; len < upper_parse && p < RExC_end; ) { + while (p < RExC_end && len < upper_fill) { /* In most cases each iteration adds one byte to the output. * The exceptions override this */ Size_t added_len = 1; oldp = p; + old_old_s = old_s; + old_s = s; /* White space has already been ignored */ assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 @@ -14161,13 +14087,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) UPDATE_WARNINGS_LOC(p - 1); ender = result; - if (ender < 0x100) { #ifdef EBCDIC + if (ender < 0x100) { if (RExC_recode_x_to_native) { ender = LATIN1_TO_NATIVE(ender); } -#endif } +#endif break; } case 'c': @@ -14324,20 +14250,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Ready to add 'ender' to the node */ if (! FOLD) { /* The simple case, just append the literal */ + not_fold_common: - not_fold_common: - if (UVCHR_IS_INVARIANT(ender) || ! UTF) { - *(s++) = (char) ender; - } - else { - U8 * new_s = uvchr_to_utf8((U8*)s, ender); - added_len = (char *) new_s - s; - s = (char *) new_s; + /* Don't output if it would overflow */ + if (UNLIKELY(len > max_string_len - ((UTF) + ? UVCHR_SKIP(ender) + : 1))) + { + overflowed = TRUE; + break; + } - if (ender > 255) { - requires_utf8_target = TRUE; - } + if (UVCHR_IS_INVARIANT(ender) || ! UTF) { + *(s++) = (char) ender; + } + else { + U8 * new_s = uvchr_to_utf8((U8*)s, ender); + added_len = (char *) new_s - s; + s = (char *) new_s; + + if (ender > 255) { + requires_utf8_target = TRUE; } + } } else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { @@ -14401,22 +14336,35 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } - if (UTF) { /* Use the folded value */ + if (UTF) { /* Alway use the folded value for UTF-8 + patterns */ if (UVCHR_IS_INVARIANT(ender)) { + if (UNLIKELY(len + 1 > max_string_len)) { + overflowed = TRUE; + break; + } + *(s)++ = (U8) toFOLD(ender); } else { - ender = _to_uni_fold_flags( + UV folded = _to_uni_fold_flags( ender, - (U8 *) s, + (U8 *) s, /* We have allocated extra space + in 's' so can't run off the + end */ &added_len, FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (UNLIKELY(len + added_len > max_string_len)) { + overflowed = TRUE; + break; + } + s += added_len; - if ( ender > 255 - && LIKELY(ender != GREEK_SMALL_LETTER_MU)) + if ( folded > 255 + && LIKELY(folded != GREEK_SMALL_LETTER_MU)) { /* U+B5 folds to the MU, so its possible for a * non-UTF-8 target to match it */ @@ -14424,63 +14372,77 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } } - else { - - /* Here is non-UTF8. First, see if the character's - * fold differs between /d and /u. */ - if (PL_fold[ender] != PL_fold_latin1[ender]) { - maybe_exactfu = FALSE; + else { /* Here is non-UTF8. */ + + /* The fold will be one or (rarely) two characters. + * Check that there's room for at least a single one + * before setting any flags, etc. Because otherwise an + * overflowing character could cause a flag to be set + * even though it doesn't end up in this node. (For + * the two character fold, we check again, before + * setting any flags) */ + if (UNLIKELY(len + 1 > max_string_len)) { + overflowed = TRUE; + break; } #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ || UNICODE_DOT_DOT_VERSION > 0) - /* On non-ancient Unicode versions, this includes the - * multi-char fold SHARP S to 'ss' */ - - if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S) - || ( isALPHA_FOLD_EQ(ender, 's') - && len > 0 - && isALPHA_FOLD_EQ(*(s-1), 's'))) - { - /* Here, we have one of the following: - * a) a SHARP S. This folds to 'ss' only under - * /u rules. If we are in that situation, - * fold the SHARP S to 'ss'. See the comments - * for join_exact() as to why we fold this - * non-UTF at compile time, and no others. - * b) 'ss'. When under /u, there's nothing - * special needed to be done here. The - * previous iteration handled the first 's', - * and this iteration will handle the second. - * If, on the otherhand it's not /u, we have - * to exclude the possibility of moving to /u, - * so that we won't generate an unwanted - * match, unless, at runtime, the target - * string is in UTF-8. - * */ + /* On non-ancient Unicodes, check for the only possible + * multi-char fold */ + if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { + /* This potential multi-char fold means the node + * can't be simple (because it could match more + * than a single char). And in some cases it will + * match 'ss', so set that flag */ + maybe_SIMPLE = 0; has_ss = TRUE; - maybe_exactfu = FALSE; /* Can't generate an - EXACTFU node (unless we - already are in one) */ - if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { - maybe_SIMPLE = 0; - if (node_type == EXACTFU) { - *(s++) = 's'; - - /* Let the code below add in the extra 's' */ - ender = 's'; - added_len = 2; + + /* It can't change to be an EXACTFU (unless already + * is one). We fold it iff under /u rules. */ + if (node_type != EXACTFU) { + maybe_exactfu = FALSE; + } + else { + if (UNLIKELY(len + 2 > max_string_len)) { + overflowed = TRUE; + break; } + + *(s++) = 's'; + *(s++) = 's'; + added_len = 2; + + goto done_with_this_char; } } + else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's')) + && LIKELY(len > 0) + && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's'))) + { + /* Also, the sequence 'ss' is special when not + * under /u. If the target string is UTF-8, it + * should match SHARP S; otherwise it won't. So, + * here we have to exclude the possibility of this + * node moving to /u.*/ + has_ss = TRUE; + maybe_exactfu = FALSE; + } #endif + /* Here, the fold will be a single character */ - else if (UNLIKELY(ender == MICRO_SIGN)) { + if (UNLIKELY(ender == MICRO_SIGN)) { has_micro_sign = TRUE; } + else if (PL_fold[ender] != PL_fold_latin1[ender]) { + + /* If the character's fold differs between /d and + * /u, this can't change to be an EXACTFU node */ + maybe_exactfu = FALSE; + } *(s++) = (DEPENDS_SEMANTICS) ? (char) toFOLD(ender) @@ -14495,6 +14457,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } /* End of adding current character to the node */ + done_with_this_char: + len += added_len; if (next_is_quantifier) { @@ -14506,168 +14470,505 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } /* End of loop through literal characters */ - /* Here we have either exhausted the input or ran out of room in - * the node. (If we encountered a character that can't be in the - * node, transfer is made directly to , and so we - * wouldn't have fallen off the end of the loop.) In the latter - * case, we artificially have to split the node into two, because - * we just don't have enough space to hold everything. This - * creates a problem if the final character participates in a - * multi-character fold in the non-final position, as a match that - * should have occurred won't, due to the way nodes are matched, - * and our artificial boundary. So back off until we find a non- - * problematic character -- one that isn't at the beginning or - * middle of such a fold. (Either it doesn't participate in any - * folds, or appears only in the final position of all the folds it - * does participate in.) A better solution with far fewer false - * positives, and that would fill the nodes more completely, would - * be to actually have available all the multi-character folds to - * test against, and to back-off only far enough to be sure that - * this node isn't ending with a partial one. is set - * further below (if we need to reparse the node) to include just - * up through that final non-problematic character that this code - * identifies, so when it is set to less than the full node, we can - * skip the rest of this */ - if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { - PERL_UINT_FAST8_T backup_count = 0; - - const STRLEN full_len = len; - - assert(len >= MAX_NODE_STRING_SIZE); - - /* Here, points to just beyond where we have output the - * final character of the node. Look backwards through the - * string until find a non- problematic character */ - - if (! UTF) { - - /* This has no multi-char folds to non-UTF characters */ - if (ASCII_FOLD_RESTRICTED) { - goto loopdone; + /* Here we have either exhausted the input or run out of room in + * the node. If the former, we are done. (If we encountered a + * character that can't be in the node, transfer is made directly + * to , and so we wouldn't have fallen off the end of the + * loop.) */ + if (LIKELY(! overflowed)) { + goto loopdone; + } + + /* Here we have run out of room. We can grow plain EXACT and + * LEXACT nodes. If the pattern is gigantic enough, though, + * eventually we'll have to artificially chunk the pattern into + * multiple nodes. */ + if (! LOC && (node_type == EXACT || node_type == LEXACT)) { + Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))]; + Size_t overhead_expansion = 0; + char temp[256]; + Size_t max_nodes_for_string; + Size_t achievable; + SSize_t delta; + + /* Here we couldn't fit the final character in the current + * node, so it will have to be reparsed, no matter what else we + * do */ + p = oldp; + + /* If would have overflowed a regular EXACT node, switch + * instead to an LEXACT. The code below is structured so that + * the actual growing code is common to changing from an EXACT + * or just increasing the LEXACT size. This means that we have + * to save the string in the EXACT case before growing, and + * then copy it afterwards to its new location */ + if (node_type == EXACT) { + overhead_expansion = regarglen[LEXACT] - regarglen[EXACT]; + RExC_emit += overhead_expansion; + Copy(s0, temp, len, char); + } + + /* Ready to grow. If it was a plain EXACT, the string was + * saved, and the first few bytes of it overwritten by adding + * an argument field. We assume, as we do elsewhere in this + * file, that one byte of remaining input will translate into + * one byte of output, and if that's too small, we grow again, + * if too large the excess memory is freed at the end */ + + max_nodes_for_string = U16_MAX - overhead - overhead_expansion; + achievable = MIN(max_nodes_for_string, + current_string_nodes + STR_SZ(RExC_end - p)); + delta = achievable - current_string_nodes; + + /* If there is just no more room, go finish up this chunk of + * the pattern. */ + if (delta <= 0) { + goto loopdone; + } + + change_engine_size(pRExC_state, delta + overhead_expansion); + current_string_nodes += delta; + max_string_len + = sizeof(struct regnode) * current_string_nodes; + upper_fill = max_string_len + 1; + + /* If the length was small, we know this was originally an + * EXACT node now converted to LEXACT, and the string has to be + * restored. Otherwise the string was untouched. 260 is just + * a number safely above 255 so don't have to worry about + * getting it precise */ + if (len < 260) { + node_type = LEXACT; + FILL_NODE(ret, node_type); + s0 = STRING(REGNODE_p(ret)); + Copy(temp, s0, len, char); + s = s0 + len; + } + + goto continue_parse; + } + else if (FOLD) { + bool splittable = FALSE; + bool backed_up = FALSE; + char * e; + char * s_start; + + /* Here is /i. Running out of room creates a problem if we are + * folding, and the split happens in the middle of a + * multi-character fold, as a match that should have occurred, + * won't, due to the way nodes are matched, and our artificial + * boundary. So back off until we aren't splitting such a + * fold. If there is no such place to back off to, we end up + * taking the entire node as-is. This can happen if the node + * consists entirely of 'f' or entirely of 's' characters (or + * things that fold to them) as 'ff' and 'ss' are + * multi-character folds. + * + * The Unicode standard says that multi character folds consist + * of either two or three characters. That means we would be + * splitting one if the final character in the node is at the + * beginning of either type, or is the second of a three + * character fold. + * + * At this point: + * ender is the code point of the character that won't fit + * in the node + * s points to just beyond the final byte in the node. + * It's where we would place ender if there were + * room, and where in fact we do place ender's fold + * in the code below, as we've over-allocated space + * for s0 (hence s) to allow for this + * e starts at 's' and advances as we append things. + * old_s is the same as 's'. (If ender had fit, 's' would + * have been advanced to beyond it). + * old_old_s points to the beginning byte of the final + * character in the node + * p points to the beginning byte in the input of the + * character beyond 'ender'. + * oldp points to the beginning byte in the input of + * 'ender'. + * + * In the case of /il, we haven't folded anything that could be + * affected by the locale. That means only above-Latin1 + * characters that fold to other above-latin1 characters get + * folded at compile time. To check where a good place to + * split nodes is, everything in it will have to be folded. + * The boolean 'maybe_exactfu' keeps track in /il if there are + * any unfolded characters in the node. */ + bool need_to_fold_loc = LOC && ! maybe_exactfu; + + /* If we do need to fold the node, we need a place to store the + * folded copy, and a way to map back to the unfolded original + * */ + char * locfold_buf = NULL; + Size_t * loc_correspondence = NULL; + + if (! need_to_fold_loc) { /* The normal case. Just + initialize to the actual node */ + e = s; + s_start = s0; + s = old_old_s; /* Point to the beginning of the final char + that fits in the node */ + } + else { + + /* Here, we have filled a /il node, and there are unfolded + * characters in it. If the runtime locale turns out to be + * UTF-8, there are possible multi-character folds, just + * like when not under /l. The node hence can't terminate + * in the middle of such a fold. To determine this, we + * have to create a folded copy of this node. That means + * reparsing the node, folding everything assuming a UTF-8 + * locale. (If at runtime it isn't such a locale, the + * actions here wouldn't have been necessary, but we have + * to assume the worst case.) If we find we need to back + * off the folded string, we do so, and then map that + * position back to the original unfolded node, which then + * gets output, truncated at that spot */ + + char * redo_p = RExC_parse; + char * redo_e; + char * old_redo_e; + + /* Allow enough space assuming a single byte input folds to + * a single byte output, plus assume that the two unparsed + * characters (that we may need) fold to the largest number + * of bytes possible, plus extra for one more worst case + * scenario. In the loop below, if we start eating into + * that final spare space, we enlarge this initial space */ + Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1; + + Newxz(locfold_buf, size, char); + Newxz(loc_correspondence, size, Size_t); + + /* Redo this node's parse, folding into 'locfold_buf' */ + redo_p = RExC_parse; + old_redo_e = redo_e = locfold_buf; + while (redo_p <= oldp) { + + old_redo_e = redo_e; + loc_correspondence[redo_e - locfold_buf] + = redo_p - RExC_parse; + + if (UTF) { + Size_t added_len; + + (void) _to_utf8_fold_flags((U8 *) redo_p, + (U8 *) RExC_end, + (U8 *) redo_e, + &added_len, + FOLD_FLAGS_FULL); + redo_e += added_len; + redo_p += UTF8SKIP(redo_p); + } + else { + + /* Note that if this code is run on some ancient + * Unicode versions, SHARP S doesn't fold to 'ss', + * but rather than clutter the code with #ifdef's, + * as is done above, we ignore that possibility. + * This is ok because this code doesn't affect what + * gets matched, but merely where the node gets + * split */ + if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) { + *redo_e++ = toLOWER_L1(UCHARAT(redo_p)); + } + else { + *redo_e++ = 's'; + *redo_e++ = 's'; + } + redo_p++; + } + + + /* If we're getting so close to the end that a + * worst-case fold in the next character would cause us + * to overflow, increase, assuming one byte output byte + * per one byte input one, plus room for another worst + * case fold */ + if ( redo_p <= oldp + && redo_e > locfold_buf + size + - (UTF8_MAXBYTES_CASE + 1)) + { + Size_t new_size = size + + (oldp - redo_p) + + UTF8_MAXBYTES_CASE + 1; + Ptrdiff_t e_offset = redo_e - locfold_buf; + + Renew(locfold_buf, new_size, char); + Renew(loc_correspondence, new_size, Size_t); + size = new_size; + + redo_e = locfold_buf + e_offset; + } } - while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { - backup_count++; + /* Set so that things are in terms of the folded, temporary + * string */ + s = old_redo_e; + s_start = locfold_buf; + e = redo_e; + + } + + /* Here, we have 's', 's_start' and 'e' set up to point to the + * input that goes into the node, folded. + * + * If the final character of the node and the fold of ender + * form the first two characters of a three character fold, we + * need to peek ahead at the next (unparsed) character in the + * input to determine if the three actually do form such a + * fold. Just looking at that character is not generally + * sufficient, as it could be, for example, an escape sequence + * that evaluates to something else, and it needs to be folded. + * + * khw originally thought to just go through the parse loop one + * extra time, but that doesn't work easily as that iteration + * could cause things to think that the parse is over and to + * goto loopdone. The character could be a '$' for example, or + * the character beyond could be a quantifier, and other + * glitches as well. + * + * The solution used here for peeking ahead is to look at that + * next character. If it isn't ASCII punctuation, then it will + * be something that continues in an EXACTish node if there + * were space. We append the fold of it to s, having reserved + * enough room in s0 for the purpose. If we can't reasonably + * peek ahead, we instead assume the worst case: that it is + * something that would form the completion of a multi-char + * fold. + * + * If we can't split between s and ender, we work backwards + * character-by-character down to s0. At each current point + * see if we are at the beginning of a multi-char fold. If so, + * that means we would be splitting the fold across nodes, and + * so we back up one and try again. + * + * If we're not at the beginning, we still could be at the + * final two characters of a (rare) three character fold. We + * check if the sequence starting at the character before the + * current position (and including the current and next + * characters) is a three character fold. If not, the node can + * be split here. If it is, we have to backup two characters + * and try again. + * + * Otherwise, the node can be split at the current position. + * + * The same logic is used for UTF-8 patterns and not */ + if (UTF) { + Size_t added_len; + + /* Append the fold of ender */ + (void) _to_uni_fold_flags( + ender, + (U8 *) e, + &added_len, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + e += added_len; + + /* 's' and the character folded to by ender may be the + * first two of a three-character fold, in which case the + * node should not be split here. That may mean examining + * the so-far unparsed character starting at 'p'. But if + * ender folded to more than one character, we already have + * three characters to look at. Also, we first check if + * the sequence consisting of s and the next character form + * the first two of some three character fold. If not, + * there's no need to peek ahead. */ + if ( added_len <= UTF8SKIP(e - added_len) + && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e))) + { + /* Here, the two do form the beginning of a potential + * three character fold. The unexamined character may + * or may not complete it. Peek at it. It might be + * something that ends the node or an escape sequence, + * in which case we don't know without a lot of work + * what it evaluates to, so we have to assume the worst + * case: that it does complete the fold, and so we + * can't split here. All such instances will have + * that character be an ASCII punctuation character, + * like a backslash. So, for that case, backup one and + * drop down to try at that position */ + if (isPUNCT(*p)) { + s = (char *) utf8_hop_back((U8 *) s, -1, + (U8 *) s_start); + backed_up = TRUE; + } + else { + /* Here, since it's not punctuation, it must be a + * real character, and we can append its fold to + * 'e' (having deliberately reserved enough space + * for this eventuality) and drop down to check if + * the three actually do form a folded sequence */ + (void) _to_utf8_fold_flags( + (U8 *) p, (U8 *) RExC_end, + (U8 *) e, + &added_len, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + e += added_len; + } } - len = s - s0 + 1; - } - else { - /* Point to the first byte of the final character */ - s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); + /* Here, we either have three characters available in + * sequence starting at 's', or we have two characters and + * know that the following one can't possibly be part of a + * three character fold. We go through the node backwards + * until we find a place where we can split it without + * breaking apart a multi-character fold. At any given + * point we have to worry about if such a fold begins at + * the current 's', and also if a three-character fold + * begins at s-1, (containing s and s+1). Splitting in + * either case would break apart a fold */ + do { + char *prev_s = (char *) utf8_hop_back((U8 *) s, -1, + (U8 *) s_start); + + /* If is a multi-char fold, can't split here. Backup + * one char and try again */ + if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) { + s = prev_s; + backed_up = TRUE; + continue; + } - while (s >= s0) { /* Search backwards until find - a non-problematic char */ - if (UTF8_IS_INVARIANT(*s)) { + /* If the two characters beginning at 's' are part of a + * three character fold starting at the character + * before s, we can't split either before or after s. + * Backup two chars and try again */ + if ( LIKELY(s > s_start) + && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e))) + { + s = prev_s; + s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start); + backed_up = TRUE; + continue; + } - /* There are no ascii characters that participate - * in multi-char folds under /aa. In EBCDIC, the - * non-ascii invariants are all control characters, - * so don't ever participate in any folds. */ - if (ASCII_FOLD_RESTRICTED - || ! IS_NON_FINAL_FOLD(*s)) - { - break; - } + /* Here there's no multi-char fold between s and the + * next character following it. We can split */ + splittable = TRUE; + break; + + } while (s > s_start); /* End of loops backing up through the node */ + + /* Here we either couldn't find a place to split the node, + * or else we broke out of the loop setting 'splittable' to + * true. In the latter case, the place to split is between + * the first and second characters in the sequence starting + * at 's' */ + if (splittable) { + s += UTF8SKIP(s); + } + } + else { /* Pattern not UTF-8 */ + if ( ender != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED) + { + *e++ = toLOWER_L1(ender); + } + else { + *e++ = 's'; + *e++ = 's'; + } + + if ( e - s <= 1 + && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e))) + { + if (isPUNCT(*p)) { + s--; + backed_up = TRUE; } - else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( - *s, *(s+1)))) + else { + if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED) { - break; + *e++ = toLOWER_L1(ender); + } + else { + *e++ = 's'; + *e++ = 's'; } } - else if (! _invlist_contains_cp( - PL_NonFinalFold, - valid_utf8_to_uvchr((U8 *) s, NULL))) - { - break; + } + + do { + if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) { + s--; + backed_up = TRUE; + continue; } - /* Here, the current character is problematic in that - * it does occur in the non-final position of some - * fold, so try the character before it, but have to - * special case the very first byte in the string, so - * we don't read outside the string */ - s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); - backup_count++; - } /* End of loop backwards through the string */ - - /* If there were only problematic characters in the string, - * will point to before s0, in which case the length - * should be 0, otherwise include the length of the - * non-problematic character just found */ - len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); - } + if ( LIKELY(s > s_start) + && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e))) + { + s -= 2; + backed_up = TRUE; + continue; + } - /* Here, have found the final character, if any, that is - * non-problematic as far as ending the node without splitting - * it across a potential multi-char fold. contains the - * number of bytes in the node up-to and including that - * character, or is 0 if there is no such character, meaning - * the whole node contains only problematic characters. In - * this case, give up and just take the node as-is. We can't - * do any better */ - if (len == 0) { - len = full_len; + splittable = TRUE; + break; - } else { + } while (s > s_start); - /* Here, the node does contain some characters that aren't - * problematic. If we didn't have to backup any, then the - * final character in the node is non-problematic, and we - * can take the node as-is */ - if (backup_count == 0) { - goto loopdone; + if (splittable) { + s++; } - else if (backup_count == 1) { + } - /* If the final character is problematic, but the - * penultimate is not, back-off that last character to - * later start a new node with it */ - p = oldp; - goto loopdone; + /* Here, we are done backing up. If we didn't backup at all + * (the likely case), just proceed */ + if (backed_up) { + + /* If we did find a place to split, reparse the entire node + * stopping where we have calculated. */ + if (splittable) { + + /* If we created a temporary folded string under /l, we + * have to map that back to the original */ + if (need_to_fold_loc) { + upper_fill = loc_correspondence[s - s_start]; + Safefree(locfold_buf); + Safefree(loc_correspondence); + + if (upper_fill == 0) { + FAIL2("panic: loc_correspondence[%d] is 0", + (int) (s - s_start)); + } + } + else { + upper_fill = s - s0; + } + goto reparse; + } + else if (need_to_fold_loc) { + Safefree(locfold_buf); + Safefree(loc_correspondence); } - /* Here, the final non-problematic character is earlier - * in the input than the penultimate character. What we do - * is reparse from the beginning, going up only as far as - * this final ok one, thus guaranteeing that the node ends - * in an acceptable character. The reason we reparse is - * that we know how far in the character is, but we don't - * know how to correlate its position with the input parse. - * An alternate implementation would be to build that - * correlation as we go along during the original parse, - * but that would entail extra work for every node, whereas - * this code gets executed only when the string is too - * large for the node, and the final two characters are - * problematic, an infrequent occurrence. Yet another - * possible strategy would be to save the tail of the - * string, and the next time regatom is called, initialize - * with that. The problem with this is that unless you - * back off one more character, you won't be guaranteed - * regatom will get called again, unless regbranch, - * regpiece ... are also changed. If you do back off that - * extra character, so that there is input guaranteed to - * force calling regatom, you can't handle the case where - * just the first character in the node is acceptable. I - * (khw) decided to try this method which doesn't have that - * pitfall; if performance issues are found, we can do a - * combination of the current approach plus that one */ - upper_parse = len; - len = 0; - s = s0; - goto reparse; + /* Here the node consists entirely of non-final multi-char + * folds. (Likely it is all 'f's or all 's's.) There's no + * decent place to split it, so give up and just take the + * whole thing */ + len = old_s - s0; } } /* End of verifying node ends with an appropriate char */ + /* We need to start the next node at the character that didn't fit + * in this one */ + p = oldp; + loopdone: /* Jumped to when encounters something that shouldn't be in the node */ /* Free up any over-allocated space; cast is to silence bogus * warning in MS VC */ change_engine_size(pRExC_state, - - (Ptrdiff_t) (initial_size - STR_SZ(len))); + - (Ptrdiff_t) (current_string_nodes - STR_SZ(len))); /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT @@ -14678,15 +14979,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { /* If the node type is EXACT here, check to see if it - * should be EXACTL, or EXACT_ONLY8. */ + * should be EXACTL, or EXACT_REQ8. */ if (node_type == EXACT) { if (LOC) { node_type = EXACTL; } else if (requires_utf8_target) { - node_type = EXACT_ONLY8; + node_type = EXACT_REQ8; } - } else if (FOLD) { + } + else if (node_type == LEXACT) { + if (requires_utf8_target) { + node_type = LEXACT_REQ8; + } + } + else if (FOLD) { if ( UNLIKELY(has_micro_sign || has_ss) && (node_type == EXACTFU || ( node_type == EXACTF && maybe_exactfu))) @@ -14703,9 +15010,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (maybe_exactfu) { node_type = EXACTFLU8; } + else if (UNLIKELY( + _invlist_contains_cp(PL_HasMultiCharFold, ender))) + { + /* A character that folds to more than one will + * match multiple characters, so can't be SIMPLE. + * We don't have to worry about this with EXACTFLU8 + * nodes just above, as they have already been + * folded (since the fold doesn't vary at run + * time). Here, if the final character in the node + * folds to multiple, it can't be simple. (This + * only has an effect if the node has only a single + * character, hence the final one, as elsewhere we + * turn off simple for nodes whose length > 1 */ + maybe_SIMPLE = 0; + } } else if (node_type == EXACTF) { /* Means is /di */ + /* This intermediate variable is needed solely because + * the asserts in the macro where used exceed Win32's + * literal string capacity */ + char first_char = * STRING(REGNODE_p(ret)); + /* If 'maybe_exactfu' is clear, then we need to stay * /di. If it is set, it means there are no code * points that match differently depending on UTF8ness @@ -14714,7 +15041,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (! maybe_exactfu) { RExC_seen_d_op = TRUE; } - else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's') + else if ( isALPHA_FOLD_EQ(first_char, 's') || isALPHA_FOLD_EQ(ender, 's')) { /* But, if the node begins or ends in an 's' we @@ -14734,16 +15061,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } if (requires_utf8_target && node_type == EXACTFU) { - node_type = EXACTFU_ONLY8; + node_type = EXACTFU_REQ8; } } OP(REGNODE_p(ret)) = node_type; - STR_LEN(REGNODE_p(ret)) = len; + setSTR_LEN(REGNODE_p(ret), len); RExC_emit += STR_SZ(len); /* If the node isn't a single character, it can't be SIMPLE */ - if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) { + if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) { maybe_SIMPLE = 0; } @@ -14795,7 +15122,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) assert(PL_regkind[OP(node)] == ANYOF); /* There is no bitmap for this node type */ - if (inRANGE(OP(node), ANYOFH, ANYOFHr)) { + if (inRANGE(OP(node), ANYOFH, ANYOFRb)) { return; } @@ -15913,8 +16240,11 @@ redo_curchar: /* Recurse, with the meat of the embedded expression */ RExC_parse++; - (void) handle_regex_sets(pRExC_state, ¤t, flagp, - depth+1, oregcomp_parse); + if (! handle_regex_sets(pRExC_state, ¤t, flagp, + depth+1, oregcomp_parse)) + { + RETURN_FAIL_ON_RESTART(*flagp, flagp); + } /* Here, 'current' contains the embedded expression's * inversion list, and RExC_parse points to the trailing @@ -15968,6 +16298,7 @@ redo_curchar: FALSE, /* Require return to be an ANYOF */ ¤t)) { + RETURN_FAIL_ON_RESTART(*flagp, flagp); goto regclass_failed; } @@ -16004,6 +16335,7 @@ redo_curchar: FALSE, /* Require return to be an ANYOF */ ¤t)) { + RETURN_FAIL_ON_RESTART(*flagp, flagp); goto regclass_failed; } @@ -16350,7 +16682,11 @@ redo_curchar: well have generated non-portable code points, but they're valid on this machine */ FALSE, /* similarly, no need for strict */ - FALSE, /* Require return to be an ANYOF */ + + /* We can optimize into something besides an ANYOF, except + * under /l, which needs to be ANYOF because of runtime + * checks for locale sanity, etc */ + ! in_locale, NULL ); @@ -16364,8 +16700,10 @@ redo_curchar: RExC_flags |= RXf_PMf_FOLD; } - if (!node) + if (!node) { + RETURN_FAIL_ON_RESTART(*flagp, flagp); goto regclass_failed; + } /* Fix up the node type if we are in locale. (We have pretended we are * under /u for the purposes of regclass(), as this construct will only @@ -16577,6 +16915,22 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings) UPDATE_WARNINGS_LOC(RExC_parse); } +PERL_STATIC_INLINE Size_t +S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max) +{ + const U8 * const start = s1; + const U8 * const send = start + max; + + PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS; + + while (s1 < send && *s1 == *s2) { + s1++; s2++; + } + + return s1 - start; +} + + STATIC AV * S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) { @@ -18102,6 +18456,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Likewise for 'posixes' */ _invlist_union(posixes, cp_list, &cp_list); + SvREFCNT_dec(posixes); /* Likewise for anything else in the range that matched only * under UTF-8 */ @@ -18249,14 +18604,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, |= ANYOFL_FOLD | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } - else if (cp_list) { /* Look to see if a 0-255 code point is in list */ - UV start, end; - invlist_iterinit(cp_list); - if (invlist_iternext(cp_list, &start, &end) && start < 256) { - anyof_flags |= ANYOFL_FOLD; - has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; - } - invlist_iterfinish(cp_list); + else if (cp_list && invlist_lowest(cp_list) < 256) { + /* If nothing is below 256, has no locale dependency; otherwise it + * does */ + anyof_flags |= ANYOFL_FOLD; + has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; } } else if ( DEPENDS_SEMANTICS @@ -18301,9 +18653,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (optimizable) { PERL_UINT_FAST8_T i; - Size_t partial_cp_count = 0; + UV partial_cp_count = 0; UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */ UV end[MAX_FOLD_FROMS+1] = { 0 }; + bool single_range = FALSE; if (cp_list) { /* Count the code points in enough ranges that we would see all the ones possible in any fold in this version @@ -18317,6 +18670,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, partial_cp_count += end[i] - start[i] + 1; } + if (i == 1) { + single_range = TRUE; + } invlist_iterfinish(cp_list); } @@ -18352,6 +18708,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, goto not_anyof; } } + /* For well-behaved locales, some classes are subsets of others, * so complementing the subset and including the non-complemented * superset should match everything, like [\D[:alnum:]], and @@ -18456,7 +18813,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Next see if can optimize classes that contain just a few code points * into an EXACTish node. The reason to do this is to let the - * optimizer join this node with adjacent EXACTish ones. + * optimizer join this node with adjacent EXACTish ones, and ANYOF + * nodes require conversion to code point from UTF-8. * * An EXACTFish node can be generated even if not under /i, and vice * versa. But care must be taken. An EXACTFish node has to be such @@ -18475,21 +18833,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * participates in no fold whatsoever, and having it EXACT tells the * optimizer the target string cannot match unless it has a colon in * it. - * - * We don't typically generate an EXACTish node if doing so would - * require changing the pattern to UTF-8, as that affects /d and - * otherwise is slower. However, under /i, not changing to UTF-8 can - * miss some potential multi-character folds. We calculate the - * EXACTish node, and then decide if something would be missed if we - * don't upgrade */ + */ if ( ! posixl && ! invert /* Only try if there are no more code points in the class than * in the max possible fold */ - && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1 - - && (start[0] < 256 || UTF || FOLD)) + && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1)) { if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) { @@ -18498,13 +18848,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (LOC) { - /* Here is /l: Use EXACTL, except /li indicates EXACTFL, - * as that means there is a fold not known until runtime so - * shows as only a single code point here. */ - op = (FOLD) ? EXACTFL : EXACTL; + /* Here is /l: Use EXACTL, except if there is a fold not + * known until runtime so shows as only a single code point + * here. For code points above 255, we know which can + * cause problems by having a potential fold to the Latin1 + * range. */ + if ( ! FOLD + || ( start[0] > 255 + && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0]))) + { + op = EXACTL; + } + else { + op = EXACTFL; + } } else if (! FOLD) { /* Not /l and not /i */ - op = (start[0] < 256) ? EXACT : EXACT_ONLY8; + op = (start[0] < 256) ? EXACT : EXACT_REQ8; } else if (start[0] < 256) { /* /i, not /l, and the code point is small */ @@ -18534,8 +18894,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, applies to it */ op = _invlist_contains_cp(PL_InMultiCharFold, start[0]) - ? EXACTFU_ONLY8 - : EXACT_ONLY8; + ? EXACTFU_REQ8 + : EXACT_REQ8; } value = start[0]; @@ -18699,7 +19059,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ? EXACTFLU8 : (ASCII_FOLD_RESTRICTED) ? EXACTFAA - : EXACTFU_ONLY8; + : EXACTFU_REQ8; value = folded; } } /* Below, the lowest code point < 256 */ @@ -18751,45 +19111,43 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (op != END) { + U8 len; - /* Here, we have calculated what EXACTish node we would use. - * But we don't use it if it would require converting the - * pattern to UTF-8, unless not using it could cause us to miss - * some folds (hence be buggy) */ - - if (! UTF && value > 255) { - SV * in_multis = NULL; - - assert(FOLD); - - /* If there is no code point that is part of a multi-char - * fold, then there aren't any matches, so we don't do this - * optimization. Otherwise, it could match depending on - * the context around us, so we do upgrade */ - _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis); - if (UNLIKELY(_invlist_len(in_multis) != 0)) { + /* Here, we have calculated what EXACTish node to use. Have to + * convert to UTF-8 if not already there */ + if (value > 255) { + if (! UTF) { + SvREFCNT_dec(cp_list);; REQUIRE_UTF8(flagp); } - else { - op = END; + + /* This is a kludge to the special casing issues with this + * ligature under /aa. FB05 should fold to FB06, but the + * call above to _to_uni_fold_flags() didn't find this, as + * it didn't use the /aa restriction in order to not miss + * other folds that would be affected. This is the only + * instance likely to ever be a problem in all of Unicode. + * So special case it. */ + if ( value == LATIN_SMALL_LIGATURE_LONG_S_T + && ASCII_FOLD_RESTRICTED) + { + value = LATIN_SMALL_LIGATURE_ST; } } - if (op != END) { - U8 len = (UTF) ? UVCHR_SKIP(value) : 1; + len = (UTF) ? UVCHR_SKIP(value) : 1; - ret = regnode_guts(pRExC_state, op, len, "exact"); - FILL_NODE(ret, op); - RExC_emit += 1 + STR_SZ(len); - STR_LEN(REGNODE_p(ret)) = len; - if (len == 1) { - *STRING(REGNODE_p(ret)) = (U8) value; - } - else { - uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value); - } - goto not_anyof; + ret = regnode_guts(pRExC_state, op, len, "exact"); + FILL_NODE(ret, op); + RExC_emit += 1 + STR_SZ(len); + setSTR_LEN(REGNODE_p(ret), len); + if (len == 1) { + *STRINGs(REGNODE_p(ret)) = (U8) value; } + else { + uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value); + } + goto not_anyof; } } @@ -18832,7 +19190,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (invlist_highest(cp_list) <= max_permissible) { UV this_start, this_end; - UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */ + UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */ U8 bits_differing = 0; Size_t full_cp_count = 0; bool first_time = TRUE; @@ -18869,7 +19227,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, full_cp_count += this_end - this_start + 1; } - invlist_iterfinish(cp_list); /* At the end of the loop, we count how many bits differ from * the bits in lowest code point, call the count 'd'. If the @@ -18898,8 +19255,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = reganode(pRExC_state, op, lowest_cp); FLAGS(REGNODE_p(ret)) = ANYOFM_mask; } + + done_anyofm: + invlist_iterfinish(cp_list); } - done_anyofm: if (inverted) { _invlist_invert(cp_list); @@ -18908,6 +19267,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (op != END) { goto not_anyof; } + + /* XXX We could create an ANYOFR_LOW node here if we saved above if + * all were invariants, it wasn't inverted, and there is a single + * range. This would be faster than some of the posix nodes we + * create below like /\d/a, but would be twice the size. Without + * having actually measured the gain, khw doesn't think the + * tradeoff is really worth it */ } if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) { @@ -19024,6 +19390,52 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvREFCNT_dec(intersection); } + /* If it is a single contiguous range, ANYOFR is an efficient regnode, + * both in size and speed. Currently, a 20 bit range base (smallest + * code point in the range), and a 12 bit maximum delta are packed into + * a 32 bit word. This allows for using it on all of the Unicode code + * points except for the highest plane, which is only for private use + * code points. khw doubts that a bigger delta is likely in real world + * applications */ + if ( single_range + && ! has_runtime_dependency + && anyof_flags == 0 + && start[0] < (1 << ANYOFR_BASE_BITS) + && end[0] - start[0] + < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1) + * CHARBITS - ANYOFR_BASE_BITS)))) + + { + U8 low_utf8[UTF8_MAXBYTES+1]; + U8 high_utf8[UTF8_MAXBYTES+1]; + + ret = reganode(pRExC_state, ANYOFR, + (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS)); + + /* Place the lowest UTF-8 start byte in the flags field, so as to + * allow efficient ruling out at run time of many possible inputs. + * */ + (void) uvchr_to_utf8(low_utf8, start[0]); + (void) uvchr_to_utf8(high_utf8, end[0]); + + /* If all code points share the same first byte, this can be an + * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can + * quickly rule out many inputs at run-time without having to + * compute the code point from UTF-8. For EBCDIC, we use I8, as + * not doing that transformation would not rule out nearly so many + * things */ + if (low_utf8[0] == high_utf8[0]) { + OP(REGNODE_p(ret)) = ANYOFRb; + ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0]; + } + else { + ANYOF_FLAGS(REGNODE_p(ret)) + = NATIVE_UTF8_TO_I8(low_utf8[0]); + } + + goto not_anyof; + } + /* If didn't find an optimization and there is no need for a bitmap, * optimize to indicate that */ if ( start[0] >= NUM_ANYOF_CODE_POINTS @@ -19034,38 +19446,63 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, U8 low_utf8[UTF8_MAXBYTES+1]; UV highest_cp = invlist_highest(cp_list); - op = ANYOFH; - /* Currently the maximum allowed code point by the system is * IV_MAX. Higher ones are reserved for future internal use. This * particular regnode can be used for higher ones, but we can't * calculate the code point of those. IV_MAX suffices though, as * it will be a large first byte */ - (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX)); + Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX)) + - low_utf8; /* We store the lowest possible first byte of the UTF-8 * representation, using the flags field. This allows for quick * ruling out of some inputs without having to convert from UTF-8 - * to code point. For EBCDIC, this has to be I8. */ + * to code point. For EBCDIC, we use I8, as not doing that + * transformation would not rule out nearly so many things */ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]); + op = ANYOFH; + /* If the first UTF-8 start byte for the highest code point in the * range is suitably small, we may be able to get an upper bound as * well */ if (highest_cp <= IV_MAX) { U8 high_utf8[UTF8_MAXBYTES+1]; - - (void) uvchr_to_utf8(high_utf8, highest_cp); + Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) + - high_utf8; /* If the lowest and highest are the same, we can get an exact - * first byte instead of a just minimum. We signal this with a - * different regnode */ + * first byte instead of a just minimum or even a sequence of + * exact leading bytes. We signal these with different + * regnodes */ if (low_utf8[0] == high_utf8[0]) { + Size_t len = find_first_differing_byte_pos(low_utf8, + high_utf8, + MIN(low_len, high_len)); + + if (len == 1) { - /* No need to convert to I8 for EBCDIC as this is an exact - * match */ - anyof_flags = low_utf8[0]; - op = ANYOFHb; + /* No need to convert to I8 for EBCDIC as this is an + * exact match */ + anyof_flags = low_utf8[0]; + op = ANYOFHb; + } + else { + op = ANYOFHs; + ret = regnode_guts(pRExC_state, op, + regarglen[op] + STR_SZ(len), + "anyofhs"); + FILL_NODE(ret, op); + ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len + = len; + Copy(low_utf8, /* Add the common bytes */ + ((struct regnode_anyofhs *) REGNODE_p(ret))->string, + len, U8); + RExC_emit += NODE_SZ_STR(REGNODE_p(ret)); + set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list, + NULL, only_utf8_locale_list); + goto not_anyof; + } } else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) { @@ -19151,8 +19588,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? listsv : NULL, + ? listsv + : NULL, only_utf8_locale_list); + SvREFCNT_dec(cp_list);; + SvREFCNT_dec(only_utf8_locale_list); return ret; not_anyof: @@ -19163,6 +19603,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start, RExC_parse - orig_parse);; SvREFCNT_dec(cp_list);; + SvREFCNT_dec(only_utf8_locale_list); return ret; } @@ -19203,11 +19644,12 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, SV *rv; if (cp_list) { - av_store(av, INVLIST_INDEX, cp_list); + av_store(av, INVLIST_INDEX, SvREFCNT_inc(cp_list)); } if (only_utf8_locale_list) { - av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list); + av_store(av, ONLY_LOCALE_MATCHES_INDEX, + SvREFCNT_inc(only_utf8_locale_list)); } if (runtime_defns) { @@ -19585,8 +20027,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) STATIC void S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) { - /* 'size' is the delta to add or subtract from the current memory allocated - * to the regex engine being constructed */ + /* 'size' is the delta number of smallest regnode equivalents to add or + * subtract from the current memory allocated to the regex engine being + * constructed. */ PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; @@ -19618,8 +20061,8 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) { - /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns - * and increments RExC_size and RExC_emit + /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode + * equivalents space. It aligns and increments RExC_size * * It returns the regnode's offset into the regex engine program */ @@ -19860,14 +20303,15 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, scan = REGNODE_OFFSET(temp); } + assert(val >= scan); if (reg_off_by_arg[OP(REGNODE_p(scan))]) { assert((UV) (val - scan) <= U32_MAX); ARG_SET(REGNODE_p(scan), val - scan); } else { if (val - scan > U16_MAX) { - /* Since not all callers check the return value, populate this with - * something that won't loop and will likely lead to a crash if + /* Populate this with something that won't loop and will likely + * lead to a crash if the caller ignores the failure return, and * execution continues */ NEXT_OFF(REGNODE_p(scan)) = U16_MAX; return FALSE; @@ -19928,15 +20372,17 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, #endif if ( exact ) { switch (OP(REGNODE_p(scan))) { + case LEXACT: case EXACT: - case EXACT_ONLY8: + case LEXACT_REQ8: + case EXACT_REQ8: case EXACTL: case EXACTF: case EXACTFU_S_EDGE: case EXACTFAA_NO_TRIE: case EXACTFAA: case EXACTFU: - case EXACTFU_ONLY8: + case EXACTFU_REQ8: case EXACTFLU8: case EXACTFUP: case EXACTFL: @@ -19978,6 +20424,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, } else { if (val - scan > U16_MAX) { + /* Populate this with something that won't loop and will likely + * lead to a crash if the caller ignores the failure return, and + * execution continues */ NEXT_OFF(REGNODE_p(scan)) = U16_MAX; return FALSE; } @@ -20255,11 +20704,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SvPVCLEAR(sv); - if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ - /* It would be nice to FAIL() here, but this may be called from - regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", - (int)OP(o), (int)REGNODE_MAX); + if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */ + if (pRExC_state) { /* This gives more info, if we have it */ + FAIL3("panic: corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); + } + else { + Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); + } + } sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; @@ -20313,6 +20767,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ NULL, NULL, NULL, + 0, FALSE ); sv_catpvs(sv, "]"); @@ -20395,10 +20850,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ else if (k == LOGICAL) /* 2: embedded, otherwise 1 */ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); - else if (k == ANYOF) { - const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr) - ? 0 - : ANYOF_FLAGS(o); + else if (k == ANYOF || k == ANYOFR) { + U8 flags; + char * bitmap; + U32 arg; bool do_sep = FALSE; /* Do we need to separate various components of the output? */ /* Set if there is still an unresolved user-defined property */ @@ -20413,7 +20868,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* And things that aren't in the bitmap, but are small enough to be */ SV* bitmap_range_not_in_bitmap = NULL; - const bool inverted = flags & ANYOF_INVERT; + bool inverted; + + if (inRANGE(OP(o), ANYOFH, ANYOFRb)) { + flags = 0; + bitmap = NULL; + arg = 0; + } + else { + flags = ANYOF_FLAGS(o); + bitmap = ANYOF_BITMAP(o); + arg = ARG(o); + } if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) { if (ANYOFL_UTF8_LOCALE_REQD(flags)) { @@ -20424,17 +20890,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } } + inverted = flags & ANYOF_INVERT; + /* If there is stuff outside the bitmap, get it */ - if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { - (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + if (arg != ANYOF_ONLY_HAS_BITMAP) { + if (inRANGE(OP(o), ANYOFR, ANYOFRb)) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + ANYOFRbase(o), + ANYOFRbase(o) + ANYOFRdelta(o)); + } + else { + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, &unresolved, &only_utf8_locale_invlist, &nonbitmap_invlist); + } + /* The non-bitmap data may contain stuff that could fit in the * bitmap. This could come from a user-defined property being * finally resolved when this call was done; or much more likely * because there are matches that require UTF-8 to be valid, and so - * aren't in the bitmap. This is teased apart later */ + * aren't in the bitmap (or ANYOFR). This is teased apart later */ _invlist_intersection(nonbitmap_invlist, PL_InBitmap, &bitmap_range_not_in_bitmap); @@ -20454,19 +20930,26 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* Ready to start outputting. First, the initial left bracket */ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) { + /* ANYOFH by definition doesn't have anything that will fit inside the + * bitmap; ANYOFR may or may not. */ + if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr) + && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb) + || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS)) + { /* Then all the things that could fit in the bitmap */ do_sep = put_charclass_bitmap_innards(sv, - ANYOF_BITMAP(o), + bitmap, bitmap_range_not_in_bitmap, only_utf8_locale_invlist, o, + flags, /* Can't try inverting for a * better display if there * are things that haven't * been resolved */ - unresolved != NULL); + unresolved != NULL + || inRANGE(OP(o), ANYOFR, ANYOFRb)); SvREFCNT_dec(bitmap_range_not_in_bitmap); /* If there are user-defined properties which haven't been defined @@ -20552,15 +21035,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - if (inRANGE(OP(o), ANYOFH, ANYOFHr)) { + if (OP(o) == ANYOFHs) { + Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); + } + else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) { U8 lowest = (OP(o) != ANYOFHr) ? FLAGS(o) : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); - U8 highest = (OP(o) == ANYOFHb) - ? lowest - : OP(o) == ANYOFH + U8 highest = (OP(o) == ANYOFHr) + ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) + : (OP(o) == ANYOFH || OP(o) == ANYOFR) ? 0xFF - : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)); + : lowest; Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); if (lowest != highest) { Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); @@ -20578,7 +21064,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ _invlist_invert(cp_list); } - put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE); + put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); SvREFCNT_dec(cp_list); @@ -21496,9 +21982,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) /* As a final resort, output the range or subrange as hex. */ - this_end = (end < NUM_ANYOF_CODE_POINTS) - ? end - : NUM_ANYOF_CODE_POINTS - 1; + if (start >= NUM_ANYOF_CODE_POINTS) { + this_end = end; + } + else { /* Have to split range at the bitmap boundary */ + this_end = (end < NUM_ANYOF_CODE_POINTS) + ? end + : NUM_ANYOF_CODE_POINTS - 1; + } #if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) ? "\\x%02" UVXf "-\\x%02" UVXf @@ -21654,6 +22145,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, SV *nonbitmap_invlist, SV *only_utf8_locale_invlist, const regnode * const node, + const U8 flags, const bool force_as_is_display) { /* Appends to 'sv' a displayable version of the innards of the bracketed @@ -21670,6 +22162,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, * 'node' is the regex pattern ANYOF node. It is needed only when the * above two parameters are not null, and is passed so that this * routine can tease apart the various reasons for them. + * 'flags' is the flags field of 'node' * 'force_as_is_display' is TRUE if this routine should definitely NOT try * to invert things to see if that leads to a cleaner display. If * FALSE, this routine is free to use its judgment about doing this. @@ -21708,8 +22201,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, literally */ SV* inverted_display; /* The output string when we invert the inputs */ - U8 flags = (node) ? ANYOF_FLAGS(node) : 0; - bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted to match? */ /* We are biased in favor of displaying things without them being inverted, @@ -22055,7 +22546,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, else if ( op == PLUS || op == STAR) { DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); } - else if (PL_regkind[(U8)op] == EXACT) { + else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) { /* Literal string, where present. */ node += NODE_SZ_STR(node) - 1; node = NEXTOPER(node); @@ -22085,6 +22576,17 @@ Perl_init_uniprops(pTHX) { dVAR; +#ifdef DEBUGGING + char * dump_len_string; + + dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); + if ( ! dump_len_string + || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) + { + PL_dump_re_max_len = 60; /* A reasonable default */ + } +#endif + PL_user_def_props = newHV(); #ifdef USE_ITHREADS @@ -22094,7 +22596,7 @@ Perl_init_uniprops(pTHX) #endif - /* Set up the inversion list global variables */ + /* Set up the inversion list interpreter-level variables */ PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]); @@ -22136,6 +22638,7 @@ Perl_init_uniprops(pTHX) PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist); PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist); + PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist); PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); @@ -22153,9 +22656,6 @@ Perl_init_uniprops(pTHX) UNI__PERL_FOLDS_TO_MULTI_CHAR]); PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ UNI__PERL_IS_IN_MULTI_CHAR_FOLD]); - PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[ - UNI__PERL_NON_FINAL_FOLDS]); - PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist); @@ -22888,7 +23388,7 @@ Perl_parse_uniprop_string(pTHX_ Titlecase Mapping (both full and simple) Uppercase Mapping (both full and simple) * Move the part that looks at the property values into a perl - * script, like utf8_heavy.pl is done. This makes things somewhat + * script, like utf8_heavy.pl was done. This makes things somewhat * easier, but most importantly, it avoids always adding all these * strings to the memory usage when the feature is little-used. * @@ -22925,7 +23425,7 @@ Perl_parse_uniprop_string(pTHX_ /* Certain properties whose values are numeric need special handling. * They may optionally be prefixed by 'is'. Ignore that prefix for the * purposes of checking if this is one of those properties */ - if (memBEGINPs(lookup_name, name_len, "is")) { + if (memBEGINPs(lookup_name, j, "is")) { lookup_offset = 2; } @@ -23091,7 +23591,9 @@ Perl_parse_uniprop_string(pTHX_ } /* Store the first real character in the denominator */ - lookup_name[j++] = name[i]; + if (i < name_len) { + lookup_name[j++] = name[i]; + } } } @@ -23109,7 +23611,7 @@ Perl_parse_uniprop_string(pTHX_ /* If the original input began with 'In' or 'Is', it could be a subroutine * call to a user-defined property instead of a Unicode property name. */ - if ( non_pkg_begin + name_len > 2 + if ( name_len - non_pkg_begin > 2 && name[non_pkg_begin+0] == 'I' && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's')) {