X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b3d298be91e6629f3ce52b46c746fbe982114323..4d5e5e004ff6a117972995779fc75aaea42a3d4c:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 9ed4510..783349e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -126,7 +126,9 @@ typedef struct RExC_state_t { I32 whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ regnode *emit_bound; /* First regnode outside of the allocated space */ - regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ + regnode *emit; /* Code-emit pointer; if = &emit_dummy, + implies compiling, so don't emit */ + regnode emit_dummy; /* placeholder for emit to point to */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; @@ -187,6 +189,7 @@ typedef struct RExC_state_t { #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ #endif #define RExC_emit (pRExC_state->emit) +#define RExC_emit_dummy (pRExC_state->emit_dummy) #define RExC_emit_start (pRExC_state->emit_start) #define RExC_emit_bound (pRExC_state->emit_bound) #define RExC_naughty (pRExC_state->naughty) @@ -402,6 +405,7 @@ static const scan_data_t zero_scan_data = #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ #define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 #define UTF cBOOL(RExC_utf8) @@ -570,9 +574,9 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define ckWARN2regdep(loc,m, a1) STMT_START { \ +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -631,7 +635,7 @@ static const scan_data_t zero_scan_data = #define Set_Cur_Node_Offset #define Set_Node_Length_To_R(node,len) #define Set_Node_Length(node,len) -#define Set_Node_Cur_Length(node) +#define Set_Node_Cur_Length(node,start) #define Node_Offset(n) #define Node_Length(n) #define Set_Node_Offset_Length(node,offset,len) @@ -670,9 +674,8 @@ static const scan_data_t zero_scan_data = #define Set_Node_Length(node,len) \ Set_Node_Length_To_R((node)-RExC_emit_start, len) -#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) -#define Set_Node_Cur_Length(node) \ - Set_Node_Length(node, RExC_parse - parse_start) +#define Set_Node_Cur_Length(node, start) \ + Set_Node_Length(node, RExC_parse - start) /* Get offsets and lengths */ #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) @@ -845,7 +848,7 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define cl_init_zero cl_init /* 'AND' a given class with another one. Can create false positives. 'cl' * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if @@ -3907,8 +3910,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if ( /* ? quantifier ok, except for (?{ ... }) */ - (next_is_eval || !(mincount == 0 && maxcount == 1)) + if (!(flags & SCF_TRIE_DOING_RESTUDY) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ @@ -5114,16 +5118,15 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * sv_catsv_nomg(pat, msv); * that allows us to adjust code block indices if * needed */ - STRLEN slen, dlen; + STRLEN dlen; char *dst = SvPV_force_nomg(pat, dlen); - const char *src = SvPV_flags_const(msv, slen, 0); orig_patlen = dlen; if (SvUTF8(msv) && !SvUTF8(pat)) { S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); sv_setpvn(pat, dst, dlen); SvUTF8_on(pat); } - sv_catpvn_nomg(pat, src, slen); + sv_catsv_nomg(pat, msv); rx = msv; } else @@ -5518,6 +5521,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 sawlookahead = 0; I32 sawplus = 0; I32 sawopen = 0; + I32 sawminmod = 0; + regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool recompile = 0; bool runtime_code = 0; @@ -5796,7 +5801,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &PL_regdummy; + RExC_emit = &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -6026,7 +6031,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } reStudy: - r->minlen = minlen = sawlookahead = sawplus = sawopen = 0; + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT @@ -6095,12 +6100,15 @@ reStudy: * the only op that could be a regnode is PLUS, all the rest * will be regnode_1 or regnode_2. * + * (yves doesn't think this is true) */ if (OP(first) == PLUS) sawplus = 1; - else + else { + if (OP(first) == MINMOD) + sawminmod = 1; first += regarglen[OP(first)]; - + } first = NEXTOPER(first); first_next= regnext(first); } @@ -6171,7 +6179,7 @@ reStudy: first = NEXTOPER(first); goto again; } - if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6222,7 +6230,9 @@ reStudy: minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ &data, -1, NULL, NULL, - SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag + | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); @@ -6358,7 +6368,10 @@ reStudy: minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); + &data, -1, NULL, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS + |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); CHECK_RESTUDY_GOTO_butfirst(NOOP); @@ -7013,11 +7026,12 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* This section of code defines the inversion list object and its methods. The * interfaces are highly subject to change, so as much as possible is static to * this file. An inversion list is here implemented as a malloc'd C UV array - * with some added info that is placed as UVs at the beginning in a header - * portion. An inversion list for Unicode is an array of code points, sorted - * by ordinal number. The zeroth element is the first code point in the list. - * The 1th element is the first element beyond that not in the list. In other - * words, the first range is + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is * invlist[0]..(invlist[1]-1) * The other ranges follow. Thus every element whose index is divisible by two * marks the beginning of a range that is in the list, and every element not @@ -7035,9 +7049,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. * This implementation reserves an element at the beginning of each inversion - * list to contain 0 when the list contains 0, and contains 1 otherwise. The - * actual beginning of the list is either that element if 0, or the next one if - * 1. + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -7052,32 +7066,31 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * should eventually be made public */ /* The header definitions are in F */ -#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV)) -#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH) - -#define INVLIST_INITIAL_LEN 10 PERL_STATIC_INLINE UV* S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) { /* Returns a pointer to the first element in the inversion list's array. * This is called upon initialization of an inversion list. Where the - * array begins depends on whether the list has the code point U+0000 - * in it or not. The other parameter tells it whether the code that - * follows this call is about to put a 0 in the inversion list or not. - * The first element is either the element with 0, if 0, or the next one, - * if 1 */ + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ - UV* zero = get_invlist_zero_addr(invlist); + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *_get_invlist_len_addr(invlist)); + assert(! _invlist_len(invlist)); + + *zero_addr = 0; /* 1^1 = 0; 1^0 = 1 */ - *zero = 1 ^ will_have_0; - return zero + *zero; + *offset = 1 ^ will_have_0; + return zero_addr + *offset; } PERL_STATIC_INLINE UV* @@ -7091,53 +7104,44 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*_get_invlist_len_addr(invlist)); - assert(*get_invlist_zero_addr(invlist) == 0 - || *get_invlist_zero_addr(invlist) == 1); - - /* The array begins either at the element reserved for zero if the - * list contains 0 (that element will be set to 0), or otherwise the next - * element (in which case the reserved element will be set to 1). */ - return (UV *) (get_invlist_zero_addr(invlist) - + *get_invlist_zero_addr(invlist)); + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); } PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len) +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { - /* Sets the current number of elements stored in the inversion list */ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *_get_invlist_len_addr(invlist) = len; - - assert(len <= SvLEN(invlist)); - - SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); - /* If the list contains U+0000, that element is part of the header, - * and should not be counted as part of the array. It will contain - * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and - * subtract: - * SvCUR_set(invlist, - * TO_INTERNAL_SIZE(len - * - (*get_invlist_zero_addr(inv_list) ^ 1))); - * But, this is only valid if len is not 0. The consequences of not doing - * this is that the memory allocation code may think that 1 more UV is - * being used than actually is, and so might do an unnecessary grow. That - * seems worth not bothering to make this the precise amount. - * - * Note that when inverting, SvCUR shouldn't change */ + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); } PERL_STATIC_INLINE IV* S_get_invlist_previous_index_addr(pTHX_ SV* invlist) { - /* Return the address of the UV that is reserved to hold the cached index + /* Return the address of the IV that is reserved to hold the cached index * */ PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV))); + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); } PERL_STATIC_INLINE IV @@ -7170,22 +7174,13 @@ S_invlist_max(pTHX_ SV* const invlist) PERL_ARGS_ASSERT_INVLIST_MAX; - return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? _invlist_len(invlist) - : FROM_INTERNAL_SIZE(SvLEN(invlist)); -} - -PERL_STATIC_INLINE UV* -S_get_invlist_zero_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that is reserved to hold 0 if the inversion - * list contains 0. This has to be the last element of the heading, as the - * list proper starts with either it if 0, or the next element if not. - * (But we force it to contain either 0 or 1) */ - - PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); - return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV))); + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; } #ifndef PERL_IN_XSUB_RE @@ -7200,56 +7195,75 @@ Perl__new_invlist(pTHX_ IV initial_size) SV* new_list; if (initial_size < 0) { - initial_size = INVLIST_INITIAL_LEN; + initial_size = 10; } /* Allocate the initial space */ - new_list = newSV(TO_INTERNAL_SIZE(initial_size)); - invlist_set_len(new_list, 0); + new_list = newSV_type(SVt_INVLIST); - /* Force iterinit() to be used to get iteration to work */ - *get_invlist_iter_addr(new_list) = UV_MAX; + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); - /* This should force a segfault if a method doesn't initialize this - * properly */ - *get_invlist_zero_addr(new_list) = UV_MAX; + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; *get_invlist_previous_index_addr(new_list) = 0; - *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; -#if HEADER_LENGTH != 5 -# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length -#endif return new_list; } #endif STATIC SV* -S__new_invlist_C_array(pTHX_ UV* list) +S__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list * form, including internal fields. Thus this is a dangerous routine that - * should not be used in the wrong hands */ + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ - SV* invlist = newSV_type(SVt_PV); + SV* invlist = newSV_type(SVt_INVLIST); PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - SvPV_set(invlist, (char *) list); + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist))); - if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { - Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); - } + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); - /* Initialize the iteration pointer. - * XXX This could be done at compile time in charclass_invlists.h, but I - * (khw) am not confident that the suffixes for specifying the C constant - * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured - * to use 64 bits; might need a Configure probe */ + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); return invlist; @@ -7262,7 +7276,11 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); } PERL_STATIC_INLINE void @@ -7270,9 +7288,10 @@ S_invlist_trim(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ - SvPV_shrink_to_cur((SV *) invlist); } @@ -7288,11 +7307,13 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); UV len = _invlist_len(invlist); + bool offset; PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; if (len == 0) { /* Empty lists must be initialized */ - array = _invlist_array_init(invlist, start == 0); + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); } else { /* Here, the existing list is non-empty. The current max entry in the @@ -7315,6 +7336,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end * value not in the set, it is extending the set, so the new first * value not in the set is one greater than the newly extended range. * */ + offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { array[final_element] = end + 1; @@ -7322,7 +7344,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, * just let the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } return; } @@ -7332,16 +7354,18 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end len += 2; /* Includes an element each for the start and end of range */ - /* If overflows the existing space, extend, which may cause the array to be - * moved */ + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ if (max < len) { invlist_extend(invlist, len); - invlist_set_len(invlist, len); /* Have to set len here to avoid assert - failure in invlist_array() */ + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + array = invlist_array(invlist); } else { - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, offset); } /* The next item on the list starts the range, the one after that is @@ -7353,7 +7377,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, just let * the range have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } } @@ -7541,7 +7565,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, @@ -7563,8 +7587,8 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co * return the larger of the input lists, but then outside code might need * to keep track of whether to free the input list or not */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7632,23 +7656,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7744,7 +7762,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* Set result to final length, which can change the pointer to array_u, so * re-find it */ if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u); + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); array_u = invlist_array(u); } @@ -7765,11 +7783,6 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - /* We may be removing a reference to one of the inputs */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); @@ -7781,7 +7794,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, @@ -7798,8 +7811,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * union above */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7864,23 +7877,17 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7971,7 +7978,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r); + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); array_r = invlist_array(r); } @@ -7987,11 +7994,6 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - /* We may be removing a reference to one of the inputs */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); @@ -8060,27 +8062,17 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - UV* len_pos = _get_invlist_len_addr(invlist); - PERL_ARGS_ASSERT__INVLIST_INVERT; assert(! invlist_is_iterating(invlist)); /* The inverse of matching nothing is matching everything */ - if (*len_pos == 0) { + if (_invlist_len(invlist) == 0) { _append_range_to_invlist(invlist, 0, UV_MAX); return; } - /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the - * zero element was a 0, so it is being removed, so the length decrements - * by 1; and vice-versa. SvCUR is unaffected */ - if (*get_invlist_zero_addr(invlist) ^= 1) { - (*len_pos)--; - } - else { - (*len_pos)++; - } + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } void @@ -8110,11 +8102,11 @@ Perl__invlist_invert_prop(pTHX_ SV* const invlist) invlist_extend(invlist, len); array = invlist_array(invlist); } - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); array[len - 1] = PERL_UNICODE_MAX + 1; } else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); } } @@ -8132,17 +8124,19 @@ S_invlist_clone(pTHX_ SV* const invlist) /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); - STRLEN length = SvCUR(invlist); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); PERL_ARGS_ASSERT_INVLIST_CLONE; - SvCUR_set(new_invlist, length); /* This isn't done automatically */ - Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); return new_invlist; } -PERL_STATIC_INLINE UV* +PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(pTHX_ SV* invlist) { /* Return the address of the UV that contains the current iteration @@ -8150,17 +8144,9 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); -} + assert(SvTYPE(invlist) == SVt_INVLIST); -PERL_STATIC_INLINE UV* -S_get_invlist_version_id_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the version id. */ - - PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; - - return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); + return &(((XINVLIST*) SvANY(invlist))->iterator); } PERL_STATIC_INLINE void @@ -8184,7 +8170,7 @@ S_invlist_iterfinish(pTHX_ SV* invlist) PERL_ARGS_ASSERT_INVLIST_ITERFINISH; - *get_invlist_iter_addr(invlist) = UV_MAX; + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; } STATIC bool @@ -8197,14 +8183,14 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) * <*start> and <*end> are unchanged, and the next call to this function * will start over at the beginning of the list */ - UV* pos = get_invlist_iter_addr(invlist); + STRLEN* pos = get_invlist_iter_addr(invlist); UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_ITERNEXT; if (*pos >= len) { - *pos = UV_MAX; /* Force iterinit() to be required next time */ + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ return FALSE; } @@ -8227,7 +8213,7 @@ S_invlist_is_iterating(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - return *(get_invlist_iter_addr(invlist)) < UV_MAX; + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } PERL_STATIC_INLINE UV @@ -8330,14 +8316,14 @@ Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) #if 0 bool -S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { /* Return a boolean as to if the two passed in inversion lists are * identical. The final argument, if TRUE, says to take the complement of * the second inversion list before doing the comparison */ - UV* array_a = invlist_array(a); - UV* array_b = invlist_array(b); + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); UV len_a = _invlist_len(a); UV len_b = _invlist_len(b); @@ -8359,23 +8345,17 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) /* Otherwise, to complement, we invert. Here, the first element is * 0, just remove it. To do this, we just pretend the array starts - * one later, and clear the flag as we don't have to do anything - * else later */ + * one later */ array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -8392,22 +8372,14 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) } } - if (complement_b) { - array_b[0] = 1; - } return retval; } #endif #undef HEADER_LENGTH -#undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE #undef FROM_INTERNAL_SIZE -#undef INVLIST_LEN_OFFSET -#undef INVLIST_ZERO_OFFSET -#undef INVLIST_ITER_OFFSET #undef INVLIST_VERSION_ID -#undef INVLIST_PREVIOUS_INDEX_OFFSET /* End of inversion list object */ @@ -8825,7 +8797,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp |= HASWIDTH; Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); return ret; @@ -9475,7 +9447,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (paren == '>') node = SUSPEND, flag = 0; reginsert(pRExC_state, node,ret, depth+1); - Set_Node_Cur_Length(ret); + Set_Node_Cur_Length(ret, parse_start); Set_Node_Offset(ret, parse_start + 1); ret->flags = flag; REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); @@ -9683,23 +9655,13 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } - else if (max == 0) { /* replace {0} with a nothing node */ - if (SIZE_ONLY) { - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; - } - else { - RExC_emit = orig_emit; - } - ret = reg_node(pRExC_state, NOTHING); - return ret; - } do_curly: if ((flags&SIMPLE)) { RExC_naughty += 2 + RExC_naughty / 2; reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(ret, parse_start+1); /* MJD */ - Set_Node_Cur_Length(ret); + Set_Node_Cur_Length(ret, parse_start); } else { regnode * const w = reg_node(pRExC_state, WHILEM); @@ -9804,9 +9766,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, MINMOD, ret, depth+1); REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); } -#ifndef REG_ALLOW_MINMOD_SUSPEND else -#endif if (RExC_parse < RExC_end && *RExC_parse == '+') { regnode *ender; nextchar(pRExC_state); @@ -9816,7 +9776,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret->flags = 0; ender = reg_node(pRExC_state, TAIL); REGTAIL(pRExC_state, ret, ender); - /*ret= ender;*/ } if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { @@ -10581,7 +10540,7 @@ tryagain: RExC_parse--; Set_Node_Offset(ret, parse_start + 2); - Set_Node_Cur_Length(ret); + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); } break; @@ -10645,7 +10604,7 @@ tryagain: /* override incorrect value set in reganode MJD */ Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); } @@ -10688,11 +10647,13 @@ tryagain: if (num < 1) vFAIL("Reference to nonexistent or unclosed group"); } - if (!isg && num > 9 && num >= RExC_npar) + if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') /* Probably a character specified in octal, e.g. \35 */ goto defchar; else { +#ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ +#endif while (isDIGIT(*RExC_parse)) RExC_parse++; if (hasbrace) { @@ -10720,7 +10681,7 @@ tryagain: /* override incorrect value set in reganode MJD */ Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); RExC_parse--; nextchar(pRExC_state); } @@ -10963,10 +10924,28 @@ tryagain: p++; ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); break; - case '0': case '1': case '2': case '3':case '4': + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': case '5': case '6': case '7': - if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + /* When we parse backslash escapes there is ambiguity between + * backreferences and octal escapes. Any escape from \1 - \9 is + * a backreference, any multi-digit escape which does not start with + * 0 and which when evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else is octal. + * + * Note this implies that \118 could be interpreted as 118 OR as + * "\11" . "8" depending on whether there were 118 capture buffers + * defined already in the pattern. + */ + if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + case '0': { I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; @@ -10985,11 +10964,6 @@ tryagain: form_short_octal_warning(p, numlen)); } } - else { /* Not to be treated as an octal constant, go - find backref */ - --p; - goto loopdone; - } if (PL_encoding && ender < 0x100) goto recode_encoding; break; @@ -11328,12 +11302,6 @@ tryagain: loopdone: /* Jumped to when encounters something that shouldn't be in the node */ - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; - } - /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT * node. Might as well be NOTHING instead */ @@ -11341,11 +11309,17 @@ tryagain: OP(ret) = NOTHING; } else{ + + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds */ + if (FOLD && maybe_exact) { + OP(ret) = EXACT; + } alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); } RExC_parse = p - 1; - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); { /* len is STRLEN which is unsigned, need to copy to signed */ @@ -11661,6 +11635,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f * these things, we need to realize that something preceded by a backslash * is escaped, so we have to keep track of backslashes */ if (SIZE_ONLY) { + UV depth = 0; /* how many nested (?[...]) constructs */ Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), @@ -11672,6 +11647,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_parse = regpatws(pRExC_state, RExC_parse, TRUE); /* means recognize comments */ switch (*RExC_parse) { + case '?': + if (RExC_parse[1] == '[') depth++, RExC_parse++; + /* FALL THROUGH */ default: break; case '\\': @@ -11719,6 +11697,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } case ']': + if (depth--) break; RExC_parse++; if (RExC_parse < RExC_end && *RExC_parse == ')') @@ -11774,7 +11753,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f * been parsed and evaluated to a single operand (or else is a syntax * error), and is handled as a regular operand */ - stack = newAV(); + sv_2mortal((SV *)(stack = newAV())); while (RExC_parse < RExC_end) { I32 top_index = av_tindex(stack); @@ -11946,6 +11925,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f || IS_OPERAND(lparen) || SvUV(lparen) != '(') { + SvREFCNT_dec(current); RExC_parse++; vFAIL("Unexpected ')'"); } @@ -11964,9 +11944,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } else { SV* top = av_pop(stack); + SV *prev = NULL; char current_operator; if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); vFAIL("Operand with no preceding operator"); } current_operator = (char) SvUV(top); @@ -11993,7 +11976,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f goto handle_operand; case '&': - _invlist_intersection(av_pop(stack), + prev = av_pop(stack); + _invlist_intersection(prev, current, ¤t); av_push(stack, current); @@ -12001,12 +11985,14 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f case '|': case '+': - _invlist_union(av_pop(stack), current, ¤t); + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); av_push(stack, current); break; case '-': - _invlist_subtract(av_pop(stack), current, ¤t); + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); av_push(stack, current); break; @@ -12016,9 +12002,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f SV* u = NULL; SV* element; - element = av_pop(stack); - _invlist_union(element, current, &u); - _invlist_intersection(element, current, &i); + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; _invlist_subtract(u, i, ¤t); av_push(stack, current); SvREFCNT_dec_NN(i); @@ -12031,6 +12020,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); } SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); } } @@ -12094,7 +12084,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_end = save_end; SvREFCNT_dec_NN(final); SvREFCNT_dec_NN(result_string); - SvREFCNT_dec_NN(stack); nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -13532,7 +13521,7 @@ parseit: default: /* Use deprecated warning to increase the * chances of this being output */ - ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); break; } } @@ -13714,6 +13703,7 @@ parseit: if (ret_invlist) { *ret_invlist = cp_list; + SvREFCNT_dec(swash); /* Discard the generated node */ if (SIZE_ONLY) { @@ -14415,6 +14405,28 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + for (bit=0; bit<32; bit++) { + if (flags & (1<extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); - DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); #else PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT;