X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/596005989a9f38f57e92b1769576e19a9527c59a..a42823ac3c233f0a9b8aefaac74a3b1e1600e6f6:/regcomp.c diff --git a/regcomp.c b/regcomp.c index c747071..1a2503f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -848,7 +848,7 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define cl_init_zero cl_init /* 'AND' a given class with another one. Can create false positives. 'cl' * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if @@ -6242,7 +6242,7 @@ reStudy: && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen && !(RExC_seen & REG_SEEN_VERBARG) - && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) r->extflags |= RXf_CHECK_ALL; scan_commit(pRExC_state, &data,&minlen,0); @@ -6339,9 +6339,7 @@ reStudy: r->check_offset_min = r->float_min_offset; r->check_offset_max = r->float_max_offset; } - /* XXXX Currently intuiting is not compatible with ANCH_GPOS. - This should be changed ASAP! */ - if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) ) { r->extflags |= RXf_USE_INTUIT; if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; @@ -6728,13 +6726,23 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) - ) - goto ret_undef; + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6840,13 +6848,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6859,8 +6881,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6872,13 +6892,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -7025,9 +7039,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* This section of code defines the inversion list object and its methods. The * interfaces are highly subject to change, so as much as possible is static to - * this file. An inversion list is here implemented as a malloc'd C UV array. - * Currently it is a SVt_PVLV, with some of the header fields from that - * repurposed for uses here. + * this file. An inversion list is here implemented as a malloc'd C UV array + * as an SVt_INVLIST scalar. * * An inversion list for Unicode is an array of code points, sorted by ordinal * number. The zeroth element is the first code point in the list. The 1th @@ -7049,10 +7062,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * list.) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. - * This implementation reserves an element (considered to be the final element - * of the header) at the beginning of each inversion list to always contain 0; - * there is an additional flag in the header which indicates if the list begins - * at the 0, or is offset to begin at the next element. + * This implementation reserves an element at the beginning of each inversion + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -7068,14 +7081,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* The header definitions are in F */ -/* This converts to/from our UVs to what the SV code is expecting: bytes. The - * first element is always a 0, which may or may not currently be in the list. - * Just assume the worst case, that it isn't, and so the length of the list - * passed in has to add 1 to account for it */ -#define TO_INTERNAL_SIZE(x) (((x) + 1) * sizeof(UV)) -#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - 1) - - PERL_STATIC_INLINE UV* S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) { @@ -7087,13 +7092,13 @@ S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) * element is either the element reserved for 0, if TRUE, or the element * after it, if FALSE */ - U8* offset = get_invlist_offset_addr(invlist); + bool* offset = get_invlist_offset_addr(invlist); UV* zero_addr = (UV *) SvPVX(invlist); PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *_get_invlist_len_addr(invlist)); + assert(! _invlist_len(invlist)); *zero_addr = 0; @@ -7113,9 +7118,7 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*_get_invlist_len_addr(invlist)); - assert(*get_invlist_offset_addr(invlist) == 0 - || *get_invlist_offset_addr(invlist) == 1); + assert(_invlist_len(invlist)); /* The very first element always contains zero, The array begins either * there, or if the inversion list is offset, at the element after it. @@ -7126,29 +7129,33 @@ S_invlist_array(pTHX_ SV* const invlist) } PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len) +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { - /* Sets the current number of elements stored in the inversion list */ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *_get_invlist_len_addr(invlist) = len; + assert(SvTYPE(invlist) == SVt_INVLIST); - assert(SvLEN(invlist) == 0 || len <= SvLEN(invlist)); - - SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); - /* Note that when inverting, SvCUR shouldn't change */ + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); } PERL_STATIC_INLINE IV* S_get_invlist_previous_index_addr(pTHX_ SV* invlist) { - /* Return the address of the UV that is reserved to hold the cached index + /* Return the address of the IV that is reserved to hold the cached index * */ PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - return &(((XPVLV*) SvANY(invlist))->xiv_u.xivu_iv); + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); } PERL_STATIC_INLINE IV @@ -7181,20 +7188,13 @@ S_invlist_max(pTHX_ SV* const invlist) PERL_ARGS_ASSERT_INVLIST_MAX; - return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? _invlist_len(invlist) - : FROM_INTERNAL_SIZE(SvLEN(invlist)); -} - -PERL_STATIC_INLINE U8* -S_get_invlist_offset_addr(pTHX_ SV* invlist) -{ - /* Return the address of the field that says whether the inversion list is - * offset (it contains 1) or not (contains 0) */ + assert(SvTYPE(invlist) == SVt_INVLIST); - PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; - - return (U8*) &(LvFLAGS(invlist)); + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; } #ifndef PERL_IN_XSUB_RE @@ -7207,27 +7207,22 @@ Perl__new_invlist(pTHX_ IV initial_size) * system default is used instead */ SV* new_list; - U8* offset_addr; if (initial_size < 0) { initial_size = 10; } /* Allocate the initial space */ - new_list = newSV_type(SVt_PVLV); - SvGROW(new_list, TO_INTERNAL_SIZE(initial_size) + 1); /* 1 is for trailing - NUL */ - invlist_set_len(new_list, 0); + new_list = newSV_type(SVt_INVLIST); + + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); /* Force iterinit() to be used to get iteration to work */ *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; - /* This should force a segfault if a method doesn't initialize this - * properly. XXX Although now that the max is currently just 255, it might - * not segfault. */ - offset_addr = get_invlist_offset_addr(new_list); - *offset_addr = (U8) UV_MAX; - *get_invlist_previous_index_addr(new_list) = 0; return new_list; @@ -7246,19 +7241,19 @@ S__new_invlist_C_array(pTHX_ const UV* const list) const STRLEN length = (STRLEN) list[0]; const UV version_id = list[1]; - const U8 offset = (U8) list[2]; + const bool offset = cBOOL(list[2]); #define HEADER_LENGTH 3 /* If any of the above changes in any way, you must change HEADER_LENGTH * (if appropriate) and regenerate INVLIST_VERSION_ID by running * perl -E 'say int(rand 2**31-1)' */ -#define INVLIST_VERSION_ID 1826693541/* This is a combination of a version and +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and data structure type, so that one being passed in can be validated to be an inversion list of the correct vintage. */ - SV* invlist = newSV_type(SVt_PVLV); + SV* invlist = newSV_type(SVt_INVLIST); PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; @@ -7272,17 +7267,14 @@ S__new_invlist_C_array(pTHX_ const UV* const list) SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - /* The 'length' passed to us is the number of elements in the inversion - * list. If the list doesn't include the always present 0 element, the - * length should be adjusted upwards to include it. The TO_INTERNAL_SIZE - * macro returns a worst case scenario, always making that adjustment, even - * if not needed. To get the precise size, then, we have to subtract 1 - * when that adjustment wasn't needed. That happens when the offset was 0; - * the exclusive-or flips the 0 to 1, and vice versa */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(length - (offset ^ 1))); - - invlist_set_len(invlist, length); + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); + invlist_set_previous_index(invlist, 0); /* Initialize the iteration pointer. */ @@ -7298,7 +7290,11 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); } PERL_STATIC_INLINE void @@ -7306,9 +7302,10 @@ S_invlist_trim(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ - SvPV_shrink_to_cur((SV *) invlist); } @@ -7324,11 +7321,13 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); UV len = _invlist_len(invlist); + bool offset; PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; if (len == 0) { /* Empty lists must be initialized */ - array = _invlist_array_init(invlist, start == 0); + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); } else { /* Here, the existing list is non-empty. The current max entry in the @@ -7351,6 +7350,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end * value not in the set, it is extending the set, so the new first * value not in the set is one greater than the newly extended range. * */ + offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { array[final_element] = end + 1; @@ -7358,7 +7358,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, * just let the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } return; } @@ -7368,16 +7368,18 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end len += 2; /* Includes an element each for the start and end of range */ - /* If overflows the existing space, extend, which may cause the array to be - * moved */ + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ if (max < len) { invlist_extend(invlist, len); - invlist_set_len(invlist, len); /* Have to set len here to avoid assert - failure in invlist_array() */ + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + array = invlist_array(invlist); } else { - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, offset); } /* The next item on the list starts the range, the one after that is @@ -7389,7 +7391,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, just let * the range have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } } @@ -7774,7 +7776,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b /* Set result to final length, which can change the pointer to array_u, so * re-find it */ if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u); + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); array_u = invlist_array(u); } @@ -7848,7 +7850,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = _invlist_len(a); + len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { if (len_a != 0 && complement_b) { @@ -7858,11 +7860,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * must be every possible code point. Thus the intersection is * simply 'a'. */ if (*i != a) { - *i = invlist_clone(a); - if (*i == b) { SvREFCNT_dec_NN(b); } + + *i = invlist_clone(a); } /* else *i is already 'a' */ return; @@ -7990,7 +7992,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r); + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); array_r = invlist_array(r); } @@ -8074,27 +8076,17 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - STRLEN* len_pos = _get_invlist_len_addr(invlist); - PERL_ARGS_ASSERT__INVLIST_INVERT; assert(! invlist_is_iterating(invlist)); /* The inverse of matching nothing is matching everything */ - if (*len_pos == 0) { + if (_invlist_len(invlist) == 0) { _append_range_to_invlist(invlist, 0, UV_MAX); return; } - /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the - * zero element was a 0, so it is being removed, so the length decrements - * by 1; and vice-versa. SvCUR is unaffected */ - if (*get_invlist_offset_addr(invlist) ^= 1) { - (*len_pos)--; - } - else { - (*len_pos)++; - } + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } void @@ -8124,11 +8116,11 @@ Perl__invlist_invert_prop(pTHX_ SV* const invlist) invlist_extend(invlist, len); array = invlist_array(invlist); } - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); array[len - 1] = PERL_UNICODE_MAX + 1; } else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); } } @@ -8146,15 +8138,14 @@ S_invlist_clone(pTHX_ SV* const invlist) /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); - STRLEN length = SvCUR(invlist); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); PERL_ARGS_ASSERT_INVLIST_CLONE; - SvCUR_set(new_invlist, length); /* This isn't done automatically */ - invlist_set_len(new_invlist, _invlist_len(invlist)); - *(get_invlist_offset_addr(new_invlist)) - = *(get_invlist_offset_addr(invlist)); - Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); return new_invlist; } @@ -8167,7 +8158,9 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - return &(LvTARGOFF(invlist)); + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->iterator); } PERL_STATIC_INLINE void @@ -8335,7 +8328,7 @@ Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLISTEQ bool S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { @@ -8377,7 +8370,6 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -13907,8 +13899,6 @@ parseit: for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; } } } @@ -14711,26 +14701,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) ) ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); sv_catpvs(sv, "["); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); } @@ -14774,7 +14748,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { - int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; @@ -14788,32 +14761,19 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "^"); /* output what the standard cp 0-255 bitmap matches */ - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - do_sep = 1; - rangestart = -1; - } - } + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) + if (ANYOF_CLASS_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) { if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } + } + } EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); @@ -14824,91 +14784,62 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_UNICODE_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) - sv_catpvs(sv, "{unicode}"); - if (flags & ANYOF_NONBITMAP_NON_UTF8) - sv_catpvs(sv, "{outside bitmap}"); - - if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); + else if (ANYOF_NONBITMAP(o)) { + SV *lv; /* Set if there is something outside the bit map. */ + SV * sw; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + + /* Get the stuff that wasn't in the bitmap */ + sw = regclass_swash(prog, o, FALSE, &lv, NULL); bool byte_output = FALSE; /* If something in the bitmap has been output */ - if (lv && lv != &PL_sv_undef) { - if (sw) { - U8 s[UTF8_MAXBYTES_CASE+1]; - - for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ - uvchr_to_utf8(s, i); - - if (i < 256 - && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate - things already - output as part - of the bitmap */ - && swash_fetch(sw, s, TRUE)) - { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - byte_output = TRUE; - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - put_byte(sv, rangestart); - } - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i-1); - } - rangestart = -1; - } - } - } - - { - char *s = savesvpv(lv); - char * const origs = s; + char *s = savesvpv(lv); + char * const origs = s; - while (*s && *s != '\n') - s++; + while (*s && *s != '\n') + s++; - if (*s == '\n') { - const char * const t = ++s; + if (*s == '\n') { + const char * const t = ++s; - if (byte_output) { - sv_catpvs(sv, " "); - } + if (byte_output) { + sv_catpvs(sv, " "); + } - while (*s) { - if (*s == '\n') { + while (*s) { + if (*s == '\n') { - /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } - if (s[-1] == ' ') - s[-1] = 0; + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; - sv_catpv(sv, t); - } + sv_catpv(sv, t); + } - out_dump: + out_dump: - Safefree(origs); - } + Safefree(origs); SvREFCNT_dec_NN(lv); } } @@ -15305,7 +15236,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) so we need to copy it locally. */ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); ret->mother_re = NULL; - ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -15538,12 +15468,17 @@ S_put_byte(pTHX_ SV *sv, int c) So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { - if (c < 256) { - Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); - } + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } } else { const char string = c; @@ -15553,6 +15488,63 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + int rangestart = -1; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i <= 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + int j = i - 1; + if (i <= rangestart + 3) { /* Individual chars in short ranges */ + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + } + else if ( j > 255 + || ! isALPHANUMERIC(rangestart) + || ! isALPHANUMERIC(j) + || isDIGIT(rangestart) != isDIGIT(j) + || isUPPER(rangestart) != isUPPER(j) + || isLOWER(rangestart) != isLOWER(j) + + /* This final test should get optimized out except + * on EBCDIC platforms, where it causes ranges that + * cross discontinuities like i/j to be shown as hex + * instead of the misleading, e.g. H-K (since that + * range includes more than H, I, J, K). */ + || (j - rangestart) + != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}", + rangestart, + (j < 256) ? j : 255); + } + else { /* Here, the ends of the range are both digits, or both + uppercase, or both lowercase; and there's no + discontinuity in the range (which could happen on EBCDIC + platforms) */ + put_byte(sv, rangestart); + sv_catpvs(sv, "-"); + put_byte(sv, j); + } + rangestart = -1; + has_output_anything = TRUE; + } + } + + return has_output_anything; +} #define CLEAR_OPTSTART \ if (optstart) STMT_START { \