X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/779fedd7c3021f013726c8f53cb9e66c54637ebf..11d7de88d35184d288f54028ef5ae07962378fb1:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 7293a57..659d51f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -126,7 +126,9 @@ typedef struct RExC_state_t { I32 whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ regnode *emit_bound; /* First regnode outside of the allocated space */ - regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ + regnode *emit; /* Code-emit pointer; if = &emit_dummy, + implies compiling, so don't emit */ + regnode emit_dummy; /* placeholder for emit to point to */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; @@ -187,6 +189,7 @@ typedef struct RExC_state_t { #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ #endif #define RExC_emit (pRExC_state->emit) +#define RExC_emit_dummy (pRExC_state->emit_dummy) #define RExC_emit_start (pRExC_state->emit_start) #define RExC_emit_bound (pRExC_state->emit_bound) #define RExC_naughty (pRExC_state->naughty) @@ -364,6 +367,24 @@ typedef struct scan_data_t { struct regnode_charclass_class *start_class; } scan_data_t; +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + /* * Forward declarations for pregcomp()'s friends. */ @@ -402,6 +423,7 @@ static const scan_data_t zero_scan_data = #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ #define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 #define UTF cBOOL(RExC_utf8) @@ -570,9 +592,9 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define ckWARN2regdep(loc,m, a1) STMT_START { \ +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -631,7 +653,7 @@ static const scan_data_t zero_scan_data = #define Set_Cur_Node_Offset #define Set_Node_Length_To_R(node,len) #define Set_Node_Length(node,len) -#define Set_Node_Cur_Length(node) +#define Set_Node_Cur_Length(node,start) #define Node_Offset(n) #define Node_Length(n) #define Set_Node_Offset_Length(node,offset,len) @@ -670,9 +692,8 @@ static const scan_data_t zero_scan_data = #define Set_Node_Length(node,len) \ Set_Node_Length_To_R((node)-RExC_emit_start, len) -#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) -#define Set_Node_Cur_Length(node) \ - Set_Node_Length(node, RExC_parse - parse_start) +#define Set_Node_Cur_Length(node, start) \ + Set_Node_Length(node, RExC_parse - start) /* Get offsets and lengths */ #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) @@ -845,7 +866,7 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define cl_init_zero cl_init /* 'AND' a given class with another one. Can create false positives. 'cl' * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if @@ -2935,16 +2956,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * have to find at least two characters for a multi-fold */ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - /* The below is perhaps overboard, but this allows us to save a - * test each time through the loop at the expense of a mask. This - * is because on both EBCDIC and ASCII machines, 'S' and 's' differ - * by a single bit. On ASCII they are 32 apart; on EBCDIC, they - * are 64. This uses an exclusive 'or' to find that bit and then - * inverts it to form a mask, with just a single 0, in the bit - * position where 'S' and 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; - while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ @@ -2957,8 +2968,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } if (len == 2 - && ((*s & S_or_s_mask) == s_masked) - && ((*(s+1) & S_or_s_mask) == s_masked)) + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) { /* EXACTF nodes need to know that the minimum length @@ -3907,8 +3918,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if ( /* ? quantifier ok, except for (?{ ... }) */ - (next_is_eval || !(mincount == 0 && maxcount == 1)) + if (!(flags & SCF_TRIE_DOING_RESTUDY) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ @@ -5114,16 +5126,15 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * sv_catsv_nomg(pat, msv); * that allows us to adjust code block indices if * needed */ - STRLEN slen, dlen; + STRLEN dlen; char *dst = SvPV_force_nomg(pat, dlen); - const char *src = SvPV_flags_const(msv, slen, 0); orig_patlen = dlen; if (SvUTF8(msv) && !SvUTF8(pat)) { S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); sv_setpvn(pat, dst, dlen); SvUTF8_on(pat); } - sv_catpvn_nomg(pat, src, slen); + sv_catsv_nomg(pat, msv); rx = msv; } else @@ -5518,6 +5529,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 sawlookahead = 0; I32 sawplus = 0; I32 sawopen = 0; + I32 sawminmod = 0; + regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool recompile = 0; bool runtime_code = 0; @@ -5796,7 +5809,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &PL_regdummy; + RExC_emit = &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -6026,7 +6039,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } reStudy: - r->minlen = minlen = sawlookahead = sawplus = sawopen = 0; + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT @@ -6095,12 +6108,15 @@ reStudy: * the only op that could be a regnode is PLUS, all the rest * will be regnode_1 or regnode_2. * + * (yves doesn't think this is true) */ if (OP(first) == PLUS) sawplus = 1; - else + else { + if (OP(first) == MINMOD) + sawminmod = 1; first += regarglen[OP(first)]; - + } first = NEXTOPER(first); first_next= regnext(first); } @@ -6171,7 +6187,7 @@ reStudy: first = NEXTOPER(first); goto again; } - if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6222,7 +6238,9 @@ reStudy: minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ &data, -1, NULL, NULL, - SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag + | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); @@ -6232,7 +6250,7 @@ reStudy: && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen && !(RExC_seen & REG_SEEN_VERBARG) - && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) r->extflags |= RXf_CHECK_ALL; scan_commit(pRExC_state, &data,&minlen,0); @@ -6329,9 +6347,7 @@ reStudy: r->check_offset_min = r->float_min_offset; r->check_offset_max = r->float_max_offset; } - /* XXXX Currently intuiting is not compatible with ANCH_GPOS. - This should be changed ASAP! */ - if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) ) { r->extflags |= RXf_USE_INTUIT; if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; @@ -6358,7 +6374,10 @@ reStudy: minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); + &data, -1, NULL, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS + |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); CHECK_RESTUDY_GOTO_butfirst(NOOP); @@ -6715,13 +6734,23 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) - ) - goto ret_undef; + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6827,13 +6856,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6846,8 +6889,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6859,13 +6900,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -7013,11 +7048,12 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* This section of code defines the inversion list object and its methods. The * interfaces are highly subject to change, so as much as possible is static to * this file. An inversion list is here implemented as a malloc'd C UV array - * with some added info that is placed as UVs at the beginning in a header - * portion. An inversion list for Unicode is an array of code points, sorted - * by ordinal number. The zeroth element is the first code point in the list. - * The 1th element is the first element beyond that not in the list. In other - * words, the first range is + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is * invlist[0]..(invlist[1]-1) * The other ranges follow. Thus every element whose index is divisible by two * marks the beginning of a range that is in the list, and every element not @@ -7035,9 +7071,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. * This implementation reserves an element at the beginning of each inversion - * list to contain 0 when the list contains 0, and contains 1 otherwise. The - * actual beginning of the list is either that element if 0, or the next one if - * 1. + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -7052,32 +7088,31 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * should eventually be made public */ /* The header definitions are in F */ -#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV)) -#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH) - -#define INVLIST_INITIAL_LEN 10 PERL_STATIC_INLINE UV* S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) { /* Returns a pointer to the first element in the inversion list's array. * This is called upon initialization of an inversion list. Where the - * array begins depends on whether the list has the code point U+0000 - * in it or not. The other parameter tells it whether the code that - * follows this call is about to put a 0 in the inversion list or not. - * The first element is either the element with 0, if 0, or the next one, - * if 1 */ + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ - UV* zero = get_invlist_zero_addr(invlist); + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *_get_invlist_len_addr(invlist)); + assert(! _invlist_len(invlist)); + + *zero_addr = 0; /* 1^1 = 0; 1^0 = 1 */ - *zero = 1 ^ will_have_0; - return zero + *zero; + *offset = 1 ^ will_have_0; + return zero_addr + *offset; } PERL_STATIC_INLINE UV* @@ -7091,53 +7126,44 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*_get_invlist_len_addr(invlist)); - assert(*get_invlist_zero_addr(invlist) == 0 - || *get_invlist_zero_addr(invlist) == 1); - - /* The array begins either at the element reserved for zero if the - * list contains 0 (that element will be set to 0), or otherwise the next - * element (in which case the reserved element will be set to 1). */ - return (UV *) (get_invlist_zero_addr(invlist) - + *get_invlist_zero_addr(invlist)); + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); } PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len) +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { - /* Sets the current number of elements stored in the inversion list */ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *_get_invlist_len_addr(invlist) = len; - - assert(len <= SvLEN(invlist)); - - SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); - /* If the list contains U+0000, that element is part of the header, - * and should not be counted as part of the array. It will contain - * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and - * subtract: - * SvCUR_set(invlist, - * TO_INTERNAL_SIZE(len - * - (*get_invlist_zero_addr(inv_list) ^ 1))); - * But, this is only valid if len is not 0. The consequences of not doing - * this is that the memory allocation code may think that 1 more UV is - * being used than actually is, and so might do an unnecessary grow. That - * seems worth not bothering to make this the precise amount. - * - * Note that when inverting, SvCUR shouldn't change */ + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); } PERL_STATIC_INLINE IV* S_get_invlist_previous_index_addr(pTHX_ SV* invlist) { - /* Return the address of the UV that is reserved to hold the cached index + /* Return the address of the IV that is reserved to hold the cached index * */ PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV))); + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); } PERL_STATIC_INLINE IV @@ -7170,22 +7196,13 @@ S_invlist_max(pTHX_ SV* const invlist) PERL_ARGS_ASSERT_INVLIST_MAX; - return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? _invlist_len(invlist) - : FROM_INTERNAL_SIZE(SvLEN(invlist)); -} - -PERL_STATIC_INLINE UV* -S_get_invlist_zero_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that is reserved to hold 0 if the inversion - * list contains 0. This has to be the last element of the heading, as the - * list proper starts with either it if 0, or the next element if not. - * (But we force it to contain either 0 or 1) */ - - PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); - return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV))); + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; } #ifndef PERL_IN_XSUB_RE @@ -7200,56 +7217,75 @@ Perl__new_invlist(pTHX_ IV initial_size) SV* new_list; if (initial_size < 0) { - initial_size = INVLIST_INITIAL_LEN; + initial_size = 10; } /* Allocate the initial space */ - new_list = newSV(TO_INTERNAL_SIZE(initial_size)); - invlist_set_len(new_list, 0); + new_list = newSV_type(SVt_INVLIST); - /* Force iterinit() to be used to get iteration to work */ - *get_invlist_iter_addr(new_list) = UV_MAX; + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); - /* This should force a segfault if a method doesn't initialize this - * properly */ - *get_invlist_zero_addr(new_list) = UV_MAX; + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; *get_invlist_previous_index_addr(new_list) = 0; - *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; -#if HEADER_LENGTH != 5 -# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length -#endif return new_list; } #endif STATIC SV* -S__new_invlist_C_array(pTHX_ UV* list) +S__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list * form, including internal fields. Thus this is a dangerous routine that - * should not be used in the wrong hands */ + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ - SV* invlist = newSV_type(SVt_PV); + SV* invlist = newSV_type(SVt_INVLIST); PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - SvPV_set(invlist, (char *) list); + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist))); - if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { - Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); - } + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); - /* Initialize the iteration pointer. - * XXX This could be done at compile time in charclass_invlists.h, but I - * (khw) am not confident that the suffixes for specifying the C constant - * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured - * to use 64 bits; might need a Configure probe */ + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); return invlist; @@ -7262,7 +7298,11 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); } PERL_STATIC_INLINE void @@ -7270,9 +7310,10 @@ S_invlist_trim(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ - SvPV_shrink_to_cur((SV *) invlist); } @@ -7288,11 +7329,13 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); UV len = _invlist_len(invlist); + bool offset; PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; if (len == 0) { /* Empty lists must be initialized */ - array = _invlist_array_init(invlist, start == 0); + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); } else { /* Here, the existing list is non-empty. The current max entry in the @@ -7315,6 +7358,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end * value not in the set, it is extending the set, so the new first * value not in the set is one greater than the newly extended range. * */ + offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { array[final_element] = end + 1; @@ -7322,7 +7366,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, * just let the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } return; } @@ -7332,16 +7376,18 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end len += 2; /* Includes an element each for the start and end of range */ - /* If overflows the existing space, extend, which may cause the array to be - * moved */ + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ if (max < len) { invlist_extend(invlist, len); - invlist_set_len(invlist, len); /* Have to set len here to avoid assert - failure in invlist_array() */ + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + array = invlist_array(invlist); } else { - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, offset); } /* The next item on the list starts the range, the one after that is @@ -7353,7 +7399,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, just let * the range have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } } @@ -7541,7 +7587,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, @@ -7563,8 +7609,8 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co * return the larger of the input lists, but then outside code might need * to keep track of whether to free the input list or not */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7632,23 +7678,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7744,7 +7784,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* Set result to final length, which can change the pointer to array_u, so * re-find it */ if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u); + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); array_u = invlist_array(u); } @@ -7765,11 +7805,6 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - /* We may be removing a reference to one of the inputs */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); @@ -7781,7 +7816,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, @@ -7798,8 +7833,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * union above */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7823,7 +7858,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = _invlist_len(a); + len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { if (len_a != 0 && complement_b) { @@ -7833,11 +7868,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * must be every possible code point. Thus the intersection is * simply 'a'. */ if (*i != a) { - *i = invlist_clone(a); - if (*i == b) { SvREFCNT_dec_NN(b); } + + *i = invlist_clone(a); } /* else *i is already 'a' */ return; @@ -7864,23 +7899,17 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7971,7 +8000,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r); + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); array_r = invlist_array(r); } @@ -7987,11 +8016,6 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - /* We may be removing a reference to one of the inputs */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); @@ -8060,27 +8084,17 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - UV* len_pos = _get_invlist_len_addr(invlist); - PERL_ARGS_ASSERT__INVLIST_INVERT; assert(! invlist_is_iterating(invlist)); /* The inverse of matching nothing is matching everything */ - if (*len_pos == 0) { + if (_invlist_len(invlist) == 0) { _append_range_to_invlist(invlist, 0, UV_MAX); return; } - /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the - * zero element was a 0, so it is being removed, so the length decrements - * by 1; and vice-versa. SvCUR is unaffected */ - if (*get_invlist_zero_addr(invlist) ^= 1) { - (*len_pos)--; - } - else { - (*len_pos)++; - } + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } void @@ -8110,11 +8124,11 @@ Perl__invlist_invert_prop(pTHX_ SV* const invlist) invlist_extend(invlist, len); array = invlist_array(invlist); } - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); array[len - 1] = PERL_UNICODE_MAX + 1; } else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); } } @@ -8132,17 +8146,19 @@ S_invlist_clone(pTHX_ SV* const invlist) /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); - STRLEN length = SvCUR(invlist); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); PERL_ARGS_ASSERT_INVLIST_CLONE; - SvCUR_set(new_invlist, length); /* This isn't done automatically */ - Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); return new_invlist; } -PERL_STATIC_INLINE UV* +PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(pTHX_ SV* invlist) { /* Return the address of the UV that contains the current iteration @@ -8150,17 +8166,9 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV* -S_get_invlist_version_id_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the version id. */ - - PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); - return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); + return &(((XINVLIST*) SvANY(invlist))->iterator); } PERL_STATIC_INLINE void @@ -8184,7 +8192,7 @@ S_invlist_iterfinish(pTHX_ SV* invlist) PERL_ARGS_ASSERT_INVLIST_ITERFINISH; - *get_invlist_iter_addr(invlist) = UV_MAX; + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; } STATIC bool @@ -8197,14 +8205,14 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) * <*start> and <*end> are unchanged, and the next call to this function * will start over at the beginning of the list */ - UV* pos = get_invlist_iter_addr(invlist); + STRLEN* pos = get_invlist_iter_addr(invlist); UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_ITERNEXT; if (*pos >= len) { - *pos = UV_MAX; /* Force iterinit() to be required next time */ + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ return FALSE; } @@ -8227,7 +8235,7 @@ S_invlist_is_iterating(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - return *(get_invlist_iter_addr(invlist)) < UV_MAX; + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } PERL_STATIC_INLINE UV @@ -8293,51 +8301,67 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP +#ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist) { - /* Dumps out the ranges in an inversion list. The string 'header' - * if present is output on a line before the first range */ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ UV start, end; + STRLEN count = 0; PERL_ARGS_ASSERT__INVLIST_DUMP; - if (header && strlen(header)) { - PerlIO_printf(Perl_debug_log, "%s\n", header); - } if (invlist_is_iterating(invlist)) { - PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n"); + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); return; } invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); } else if (end != start) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", - start, end); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); } + count += 2; } } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLISTEQ bool -S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { /* Return a boolean as to if the two passed in inversion lists are * identical. The final argument, if TRUE, says to take the complement of * the second inversion list before doing the comparison */ - UV* array_a = invlist_array(a); - UV* array_b = invlist_array(b); + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); UV len_a = _invlist_len(a); UV len_b = _invlist_len(b); @@ -8359,23 +8383,17 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) /* Otherwise, to complement, we invert. Here, the first element is * 0, just remove it. To do this, we just pretend the array starts - * one later, and clear the flag as we don't have to do anything - * else later */ + * one later */ array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -8392,22 +8410,14 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) } } - if (complement_b) { - array_b[0] = 1; - } return retval; } #endif #undef HEADER_LENGTH -#undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE #undef FROM_INTERNAL_SIZE -#undef INVLIST_LEN_OFFSET -#undef INVLIST_ZERO_OFFSET -#undef INVLIST_ITER_OFFSET #undef INVLIST_VERSION_ID -#undef INVLIST_PREVIOUS_INDEX_OFFSET /* End of inversion list object */ @@ -8542,6 +8552,7 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ vWARN5( RExC_parse + 1, "Useless (%s%c) - %suse /%c modifier", @@ -8558,6 +8569,7 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ vWARN3( RExC_parse + 1, "Useless (%sc) - %suse /gc modifier", @@ -8823,7 +8835,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp |= HASWIDTH; Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); return ret; @@ -9473,7 +9485,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (paren == '>') node = SUSPEND, flag = 0; reginsert(pRExC_state, node,ret, depth+1); - Set_Node_Cur_Length(ret); + Set_Node_Cur_Length(ret, parse_start); Set_Node_Offset(ret, parse_start + 1); ret->flags = flag; REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); @@ -9681,23 +9693,13 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } - else if (max == 0) { /* replace {0} with a nothing node */ - if (SIZE_ONLY) { - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; - } - else { - RExC_emit = orig_emit; - } - ret = reg_node(pRExC_state, NOTHING); - return ret; - } do_curly: if ((flags&SIMPLE)) { RExC_naughty += 2 + RExC_naughty / 2; reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(ret, parse_start+1); /* MJD */ - Set_Node_Cur_Length(ret); + Set_Node_Cur_Length(ret, parse_start); } else { regnode * const w = reg_node(pRExC_state, WHILEM); @@ -9802,9 +9804,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, MINMOD, ret, depth+1); REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); } -#ifndef REG_ALLOW_MINMOD_SUSPEND else -#endif if (RExC_parse < RExC_end && *RExC_parse == '+') { regnode *ender; nextchar(pRExC_state); @@ -9814,7 +9814,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret->flags = 0; ender = reg_node(pRExC_state, TAIL); REGTAIL(pRExC_state, ret, ender); - /*ret= ender;*/ } if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { @@ -10579,7 +10578,7 @@ tryagain: RExC_parse--; Set_Node_Offset(ret, parse_start + 2); - Set_Node_Cur_Length(ret); + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); } break; @@ -10643,7 +10642,7 @@ tryagain: /* override incorrect value set in reganode MJD */ Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); } @@ -10686,11 +10685,13 @@ tryagain: if (num < 1) vFAIL("Reference to nonexistent or unclosed group"); } - if (!isg && num > 9 && num >= RExC_npar) + if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') /* Probably a character specified in octal, e.g. \35 */ goto defchar; else { +#ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ +#endif while (isDIGIT(*RExC_parse)) RExC_parse++; if (hasbrace) { @@ -10718,7 +10719,7 @@ tryagain: /* override incorrect value set in reganode MJD */ Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); RExC_parse--; nextchar(pRExC_state); } @@ -10751,7 +10752,7 @@ tryagain: defchar: { STRLEN len = 0; - UV ender; + UV ender = 0; char *p; char *s; #define MAX_NODE_STRING_SIZE 127 @@ -10759,17 +10760,22 @@ tryagain: char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; STRLEN foldlen; - U8 node_type; + U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. (We don't need to figure this out until + * pass 2) */ + bool maybe_exactfu = node_type == EXACTF && PASS2; + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ bool maybe_exact; - ender = 0; - node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -10782,8 +10788,8 @@ tryagain: /* We do the EXACTFish to EXACT node only if folding, and not if in * locale, as whether a character folds or not isn't known until - * runtime */ - maybe_exact = FOLD && ! LOC; + * runtime. (And we don't need to figure this out until pass 2) */ + maybe_exact = FOLD && ! LOC && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10961,10 +10967,28 @@ tryagain: p++; ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); break; - case '0': case '1': case '2': case '3':case '4': + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': case '5': case '6': case '7': - if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + /* When we parse backslash escapes there is ambiguity between + * backreferences and octal escapes. Any escape from \1 - \9 is + * a backreference, any multi-digit escape which does not start with + * 0 and which when evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else is octal. + * + * Note this implies that \118 could be interpreted as 118 OR as + * "\11" . "8" depending on whether there were 118 capture buffers + * defined already in the pattern. + */ + if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + case '0': { I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; @@ -10983,11 +11007,6 @@ tryagain: form_short_octal_warning(p, numlen)); } } - else { /* Not to be treated as an octal constant, go - find backref */ - --p; - goto loopdone; - } if (PL_encoding && ender < 0x100) goto recode_encoding; break; @@ -11019,7 +11038,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED - && ckWARN(WARN_DEPRECATED) + && ckWARN_d(WARN_DEPRECATED) && is_PATWS_non_low(p, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), @@ -11084,8 +11103,23 @@ tryagain: || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', *(s-1))))) + { + maybe_exactfu = FALSE; + } + } *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); } else { /* UTF */ @@ -11274,6 +11308,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11326,24 +11369,31 @@ tryagain: loopdone: /* Jumped to when encounters something that shouldn't be in the node */ - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; - } - /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT * node. Might as well be NOTHING instead */ if (len == 0) { OP(ret) = NOTHING; } - else{ + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } + } alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); } RExC_parse = p - 1; - Set_Node_Cur_Length(ret); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); nextchar(pRExC_state); { /* len is STRLEN which is unsigned, need to copy to signed */ @@ -11659,6 +11709,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f * these things, we need to realize that something preceded by a backslash * is escaped, so we have to keep track of backslashes */ if (SIZE_ONLY) { + UV depth = 0; /* how many nested (?[...]) constructs */ Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), @@ -11670,6 +11721,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_parse = regpatws(pRExC_state, RExC_parse, TRUE); /* means recognize comments */ switch (*RExC_parse) { + case '?': + if (RExC_parse[1] == '[') depth++, RExC_parse++; + /* FALL THROUGH */ default: break; case '\\': @@ -11717,6 +11771,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } case ']': + if (depth--) break; RExC_parse++; if (RExC_parse < RExC_end && *RExC_parse == ')') @@ -11772,7 +11827,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f * been parsed and evaluated to a single operand (or else is a syntax * error), and is handled as a regular operand */ - stack = newAV(); + sv_2mortal((SV *)(stack = newAV())); while (RExC_parse < RExC_end) { I32 top_index = av_tindex(stack); @@ -11944,6 +11999,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f || IS_OPERAND(lparen) || SvUV(lparen) != '(') { + SvREFCNT_dec(current); RExC_parse++; vFAIL("Unexpected ')'"); } @@ -11962,9 +12018,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } else { SV* top = av_pop(stack); + SV *prev = NULL; char current_operator; if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); vFAIL("Operand with no preceding operator"); } current_operator = (char) SvUV(top); @@ -11991,7 +12050,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f goto handle_operand; case '&': - _invlist_intersection(av_pop(stack), + prev = av_pop(stack); + _invlist_intersection(prev, current, ¤t); av_push(stack, current); @@ -11999,12 +12059,14 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f case '|': case '+': - _invlist_union(av_pop(stack), current, ¤t); + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); av_push(stack, current); break; case '-': - _invlist_subtract(av_pop(stack), current, ¤t); + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); av_push(stack, current); break; @@ -12014,9 +12076,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f SV* u = NULL; SV* element; - element = av_pop(stack); - _invlist_union(element, current, &u); - _invlist_intersection(element, current, &i); + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; _invlist_subtract(u, i, ¤t); av_push(stack, current); SvREFCNT_dec_NN(i); @@ -12029,6 +12094,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); } SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); } } @@ -12092,7 +12158,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_end = save_end; SvREFCNT_dec_NN(final); SvREFCNT_dec_NN(result_string); - SvREFCNT_dec_NN(stack); nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -13530,7 +13595,7 @@ parseit: default: /* Use deprecated warning to increase the * chances of this being output */ - ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); break; } } @@ -13712,6 +13777,7 @@ parseit: if (ret_invlist) { *ret_invlist = cp_list; + SvREFCNT_dec(swash); /* Discard the generated node */ if (SIZE_ONLY) { @@ -13893,8 +13959,6 @@ parseit: for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; } } } @@ -14413,6 +14477,28 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + for (bit=0; bit<32; bit++) { + if (flags & (1<extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); - DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); #else PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; @@ -14672,26 +14761,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) ) ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); sv_catpvs(sv, "["); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); } @@ -14735,7 +14808,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { - int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; @@ -14749,32 +14821,19 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "^"); /* output what the standard cp 0-255 bitmap matches */ - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - do_sep = 1; - rangestart = -1; - } - } + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) + if (ANYOF_CLASS_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) { if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } + } + } EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); @@ -14785,91 +14844,62 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_UNICODE_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) - sv_catpvs(sv, "{unicode}"); - if (flags & ANYOF_NONBITMAP_NON_UTF8) - sv_catpvs(sv, "{outside bitmap}"); - - if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); + else if (ANYOF_NONBITMAP(o)) { + SV *lv; /* Set if there is something outside the bit map. */ + SV * sw; bool byte_output = FALSE; /* If something in the bitmap has been output */ - if (lv && lv != &PL_sv_undef) { - if (sw) { - U8 s[UTF8_MAXBYTES_CASE+1]; - - for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ - uvchr_to_utf8(s, i); - - if (i < 256 - && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate - things already - output as part - of the bitmap */ - && swash_fetch(sw, s, TRUE)) - { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - byte_output = TRUE; - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - put_byte(sv, rangestart); - } - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i-1); - } - rangestart = -1; - } - } - } + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } - { - char *s = savesvpv(lv); - char * const origs = s; + /* Get the stuff that wasn't in the bitmap */ + sw = regclass_swash(prog, o, FALSE, &lv, NULL); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; - while (*s && *s != '\n') - s++; + while (*s && *s != '\n') + s++; - if (*s == '\n') { - const char * const t = ++s; + if (*s == '\n') { + const char * const t = ++s; - if (byte_output) { - sv_catpvs(sv, " "); - } + if (byte_output) { + sv_catpvs(sv, " "); + } - while (*s) { - if (*s == '\n') { + while (*s) { + if (*s == '\n') { - /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } - if (s[-1] == ' ') - s[-1] = 0; + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; - sv_catpv(sv, t); - } + sv_catpv(sv, t); + } - out_dump: + out_dump: - Safefree(origs); - } + Safefree(origs); SvREFCNT_dec_NN(lv); } } @@ -15266,7 +15296,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) so we need to copy it locally. */ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); ret->mother_re = NULL; - ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -15456,29 +15485,6 @@ Perl_save_re_context(pTHX) { dVAR; - struct re_save_state *state; - - SAVEVPTR(PL_curcop); - SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1); - - state = (struct re_save_state *)(PL_savestack + PL_savestack_ix); - PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE; - SSPUSHUV(SAVEt_RE_STATE); - - Copy(&PL_reg_state, state, 1, struct re_save_state); - - PL_reg_oldsaved = NULL; - PL_reg_oldsavedlen = 0; - PL_reg_oldsavedoffset = 0; - PL_reg_oldsavedcoffset = 0; - PL_reg_maxiter = 0; - PL_reg_leftiter = 0; - PL_reg_poscache = NULL; - PL_reg_poscache_size = 0; -#ifdef PERL_ANY_COW - PL_nrs = NULL; -#endif - /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ if (PL_curpm) { const REGEXP * const rx = PM_GETRE(PL_curpm); @@ -15522,12 +15528,17 @@ S_put_byte(pTHX_ SV *sv, int c) So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { - if (c < 256) { - Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); - } + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } } else { const char string = c; @@ -15537,6 +15548,63 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + int rangestart = -1; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i <= 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + int j = i - 1; + if (i <= rangestart + 3) { /* Individual chars in short ranges */ + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + } + else if ( j > 255 + || ! isALPHANUMERIC(rangestart) + || ! isALPHANUMERIC(j) + || isDIGIT(rangestart) != isDIGIT(j) + || isUPPER(rangestart) != isUPPER(j) + || isLOWER(rangestart) != isLOWER(j) + + /* This final test should get optimized out except + * on EBCDIC platforms, where it causes ranges that + * cross discontinuities like i/j to be shown as hex + * instead of the misleading, e.g. H-K (since that + * range includes more than H, I, J, K). */ + || (j - rangestart) + != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}", + rangestart, + (j < 256) ? j : 255); + } + else { /* Here, the ends of the range are both digits, or both + uppercase, or both lowercase; and there's no + discontinuity in the range (which could happen on EBCDIC + platforms) */ + put_byte(sv, rangestart); + sv_catpvs(sv, "-"); + put_byte(sv, j); + } + rangestart = -1; + has_output_anything = TRUE; + } + } + + return has_output_anything; +} #define CLEAR_OPTSTART \ if (optstart) STMT_START { \