X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/8e8a446824eed109a7c437ac4a417de07db94cc4..5d288d736c2758c27a5943647f4a524f0e93a642:/regcomp.c diff --git a/regcomp.c b/regcomp.c index afe6c39..be9c184 100644 --- a/regcomp.c +++ b/regcomp.c @@ -570,80 +570,85 @@ static const scan_data_t zero_scan_data = REPORT_LOCATION_ARGS(offset)); \ } STMT_END +/* These have asserts in them because of [perl #122671] Many warnings in + * regcomp.c can occur twice. If they get output in pass1 and later in that + * pass, the pattern has to be converted to UTF-8 and the pass restarted, they + * would get output again. So they should be output in pass2, and these + * asserts make sure new warnings follow that paradigm. */ /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -800,6 +805,33 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); +#ifdef DEBUGGING + +/* is c a control character for which we have a mnemonic? */ +#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) + +STATIC const char * +S_cntrl_to_mnemonic(const U8 c) +{ + /* Returns the mnemonic string that represents character 'c', if one + * exists; NULL otherwise. The only ones that exist for the purposes of + * this routine are a few control characters */ + + switch (c) { + case '\a': return "\\a"; + case '\b': return "\\b"; + case ESC_NATIVE: return "\\e"; + case '\f': return "\\f"; + case '\n': return "\\n"; + case '\r': return "\\r"; + case '\t': return "\\t"; + } + + return NULL; +} + +#endif + /* Mark that we cannot extend a found fixed substring at this point. Update the longest found anchored substring and the longest found floating substrings if needed. */ @@ -1062,7 +1094,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* Similarly for these */ if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { - invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); } if (ANYOF_FLAGS(node) & ANYOF_INVERT) { @@ -5638,9 +5670,9 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, char **pat_p, STRLEN *plen_p, int num_code_blocks) { U8 *const src = (U8*)*pat_p; - U8 *dst; + U8 *dst, *d; int n=0; - STRLEN s = 0, d = 0; + STRLEN s = 0; bool do_end = 0; GET_RE_DEBUG_FLAGS_DECL; @@ -5648,32 +5680,27 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); Newx(dst, *plen_p * 2 + 1, U8); + d = dst; while (s < *plen_p) { - if (NATIVE_BYTE_IS_INVARIANT(src[s])) - dst[d] = src[s]; - else { - dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); - dst[d] = UTF8_EIGHT_BIT_LO(src[s]); - } + append_utf8_from_native_byte(src[s], &d); if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { - pRExC_state->code_blocks[n].start = d; - assert(dst[d] == '('); + pRExC_state->code_blocks[n].start = d - dst - 1; + assert(*(d - 1) == '('); do_end = 1; } else if (do_end && pRExC_state->code_blocks[n].end == s) { - pRExC_state->code_blocks[n].end = d; - assert(dst[d] == ')'); + pRExC_state->code_blocks[n].end = d - dst - 1; + assert(*(d - 1) == ')'); do_end = 0; n++; } } s++; - d++; } - dst[d] = '\0'; - *plen_p = d; + *d = '\0'; + *plen_p = d - dst; *pat_p = (char*) dst; SAVEFREEPV(*pat_p); RExC_orig_utf8 = RExC_utf8 = 1; @@ -6027,7 +6054,6 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, ENTER; SAVETMPS; - save_re_context(); PUSHSTACKi(PERLSI_REQUIRE); /* G_RE_REPARSING causes the toker to collapse \\ into \ when * parsing qr''; normally only q'' does this. It also alters @@ -6276,6 +6302,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + + /* This is calculated here, because the Perl program that generates the + * static global ones doesn't currently have access to + * NUM_ANYOF_CODE_POINTS */ + PL_InBitmap = _new_invlist(2); + PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, + NUM_ANYOF_CODE_POINTS - 1); } #endif @@ -6838,9 +6871,7 @@ reStudy: else if (PL_regkind[OP(first)] == BOL) { r->intflags |= (OP(first) == MBOL ? PREGf_ANCH_MBOL - : (OP(first) == SBOL - ? PREGf_ANCH_SBOL - : PREGf_ANCH_BOL)); + : PREGf_ANCH_SBOL); first = NEXTOPER(first); goto again; } @@ -7106,8 +7137,8 @@ reStudy: /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", - (IV)minlen, (IV)r->minlen, RExC_maxlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n", + (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) @@ -7157,7 +7188,12 @@ reStudy: if (PL_regkind[fop] == NOTHING && nop == END) r->extflags |= RXf_NULL; - else if (PL_regkind[fop] == BOL && nop == END) + else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END) + /* when fop is SBOL first->flags will be true only when it was + * produced by parsing /\A/, and not when parsing /^/. This is + * very important for the split code as there we want to + * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. + * See rt #122761 for more details. -- Yves */ r->extflags |= RXf_START_ONLY; else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE @@ -9346,7 +9382,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ - if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (PASS2 && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; @@ -9366,7 +9402,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) break; case CONTINUE_PAT_MOD: /* 'c' */ - if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (PASS2 && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -9381,7 +9417,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) break; case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { - if (SIZE_ONLY) + if (PASS2) ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; @@ -10513,7 +10549,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (max < min) { /* If can't match, warn and optimize to fail unconditionally */ if (SIZE_ONLY) { - ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); /* We can't back off the size because we have to reserve * enough space for all the things we are about to throw @@ -10522,6 +10557,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; } else { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); RExC_emit = orig_emit; } ret = reg_node(pRExC_state, OPFAIL); @@ -10531,7 +10567,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) && RExC_parse < RExC_end && (*RExC_parse == '?' || *RExC_parse == '+')) { - if (SIZE_ONLY) { + if (PASS2) { ckWARN2reg(RExC_parse + 1, "Useless use of greediness modifier '%c'", *RExC_parse); @@ -10677,10 +10713,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } -STATIC bool +STATIC STRLEN S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, - UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ + UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse ) { @@ -10688,46 +10723,75 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. On successful return, RExC_parse has been updated to point to just after the sequence identified - by this routine, and <*flagp> has been updated. - - The \N may be inside (indicated by the boolean ) or outside a - character class. - - \N may begin either a named sequence, or if outside a character class, mean - to match a non-newline. For non single-quoted regexes, the tokenizer has - attempted to decide which, and in the case of a named sequence, converted it + by this routine, <*flagp> has been updated, and the non-NULL input pointers + have been set appropriately. + + The typical case for this is \N{some character name}. This is usually + called while parsing the input, filling in or ready to fill in an EXACTish + node, and the code point for the character should be returned, so that it + can be added to the node, and parsing continued with the next input + character. But it may be that instead of a single character the \N{} + expands to more than one, a named sequence. In this case any following + quantifier applies to the whole sequence, and it is easier, given the code + structure that calls this, to handle it from a different area of the code. + For this reason, the input parameters can be set so that it returns valid + only on one or the other of these cases. + + Another possibility is for the input to be an empty \N{}, which for + backwards compatibility we accept, but generate a NOTHING node which should + later get optimized out. This is handled from the area of code which can + handle a named sequence, so if called with the parameters for the other, it + fails. + + Still another possibility is for the \N to mean [^\n], and not a single + character or explicit sequence at all. This is determined by context. + Again, this is handled from the area of code which can handle a named + sequence, so if called with the parameters for the other, it also fails. + + And the final possibility is for the \N to be called from within a bracketed + character class. In this case the [^\n] meaning makes no sense, and so is + an error. Other anomalous situations are left to the calling code to handle. + + For non-single-quoted regexes, the tokenizer has attempted to decide which + of the above applies, and in the case of a named sequence, has converted it into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, where c1... are the characters in the sequence. For single-quoted regexes, the tokenizer passes the \N sequence through unchanged; this code will not attempt to determine this nor expand those, instead raising a syntax error. The net effect is that if the beginning of the passed-in pattern isn't '{U+' or there is no '}', it signals that this \N occurrence means to match a - non-newline. + non-newline. (This mostly was done because of [perl #56444].) - Only the \N{U+...} form should occur in a character class, for the same - reason that '.' inside a character class means to just match a period: it - just doesn't make sense. + The API is somewhat convoluted due to historical and the above reasons. The function raises an error (via vFAIL), and doesn't return for various - syntax errors. Otherwise it returns TRUE and sets or on - success; it returns FALSE otherwise. Returns FALSE, setting *flagp to - RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is - only possible if node_p is non-NULL. - + syntax errors. For other failures, it returns (STRLEN) -1. For successes, + it returns a count of how many characters were accounted for by it. (This + can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code + points in the sequence. It sets , , and/or + on success. If is non-null, it means the caller can accept an input sequence - consisting of a just a single code point; <*valuep> is set to that value - if the input is such. - - If is non-null it signifies that the caller can accept any other - legal sequence (i.e., one that isn't just a single code point). <*node_p> - is set as follows: - 1) \N means not-a-NL: points to a newly created REG_ANY node; - 2) \N{}: points to a new NOTHING node; + consisting of a just a single code point; <*valuep> is set to the value + of the only or first code point in the input. + + If is non-null, it means the caller can accept an input + sequence consisting of one or more code points; <*substitute_parse> is a + newly created mortal SV* in this case, containing \x{} escapes representing + those code points. + + Both and can be non-NULL. + + If is non-null, must be NULL. This signifies + that the caller can accept any legal sequence other than a single code + point. To wit, <*node_p> is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1 + 2) \N{}: points to a new NOTHING node; return is 0 3) otherwise: points to a new EXACT node containing the resolved - string. - Note that FALSE is returned for single code point sequences if is - null. + string; return is the number of code points in the + string. This will never be 1. + Note that failure is returned for single code point sequences if is + null and is not. */ char * endbrace; /* '}' following the name */ @@ -10736,6 +10800,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, stream */ bool has_multiple_chars; /* true if the input stream contains a sequence of more than one character */ + bool in_char_class = substitute_parse != NULL; + STRLEN count = 0; /* Number of characters in this sequence */ GET_RE_DEBUG_FLAGS_DECL; @@ -10744,6 +10810,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, GET_RE_DEBUG_FLAGS; assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + assert(! (node_p && substitute_parse)); /* At most 1 should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x * modifier. The other meaning does not, so use a temporary until we find @@ -10762,7 +10829,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } - return FALSE; + return (STRLEN) -1; } RExC_parse--; /* Need to back off so nextchar() doesn't skip the current char */ @@ -10771,7 +10838,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; Set_Node_Length(*node_p, 1); /* MJD */ - return TRUE; + return 1; } /* Here, we have decided it should be a named character or sequence */ @@ -10798,28 +10865,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, } if (endbrace == RExC_parse) { /* empty: \N{} */ - bool ret = TRUE; if (node_p) { *node_p = reg_node(pRExC_state,NOTHING); } - else if (in_char_class) { - if (SIZE_ONLY && in_char_class) { - if (strict) { - RExC_parse++; /* Position after the "}" */ - vFAIL("Zero length \\N{}"); - } - else { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class"); - } - } - ret = FALSE; - } - else { - return FALSE; + else if (! in_char_class) { + return (STRLEN) -1; } nextchar(pRExC_state); - return ret; + return 0; } RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ @@ -10831,90 +10884,103 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, * point, and is terminated by the brace */ has_multiple_chars = (endchar < endbrace); - if (valuep && (! has_multiple_chars || in_char_class)) { - /* We only pay attention to the first char of - multichar strings being returned in char classes. I kinda wonder - if this makes sense as it does change the behaviour - from earlier versions, OTOH that behaviour was broken - as well. XXX Solution is to recharacterize as - [rest-of-class]|multi1|multi2... */ - + /* We get the first code point if we want it, and either there is only one, + * or we can accept both cases of one and more than one */ + if (valuep && (substitute_parse || ! has_multiple_chars)) { STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + | PERL_SCAN_DISALLOW_PREFIX + + /* No errors in the first pass (See [perl + * #122671].) We let the code below find the + * errors when there are multiple chars. */ + | ((SIZE_ONLY || has_multiple_chars) + ? PERL_SCAN_SILENT_ILLDIGIT + : 0); *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); /* The tokenizer should have guaranteed validity, but it's possible to - * bypass it by using single quoting, so check */ - if (length_of_hex == 0 - || length_of_hex != (STRLEN)(endchar - RExC_parse) ) - { - RExC_parse += length_of_hex; /* Includes all the valid */ - RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ - ? UTF8SKIP(RExC_parse) - : 1; - /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) { - RExC_parse = endchar; + * bypass it by using single quoting, so check. Don't do the check + * here when there are multiple chars; we do it below anyway. */ + if (! has_multiple_chars) { + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } + vFAIL("Invalid hexadecimal number in \\N{U+...}"); } - vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } - if (in_char_class && has_multiple_chars) { - if (strict) { - RExC_parse = endbrace; - vFAIL("\\N{} in character class restricted to one character"); - } - else { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); - } + RExC_parse = endbrace + 1; + return 1; } - - RExC_parse = endbrace + 1; } - else if (! node_p || ! has_multiple_chars) { - /* Here, the input is legal, but not according to the caller's - * options. We fail without advancing the parse, so that the - * caller can try again */ + /* Here, we should have already handled the case where a single character + * is expected and found. So it is a failure if we aren't expecting + * multiple chars and got them; or didn't get them but wanted them. We + * fail without advancing the parse, so that the caller can try again with + * different acceptance criteria */ + if ((! node_p && ! substitute_parse) || ! has_multiple_chars) { RExC_parse = p; - return FALSE; + return (STRLEN) -1; } - else { + + { /* What is done here is to convert this to a sub-pattern of the form - * (?:\x{char1}\x{char2}...) - * and then call reg recursively. That way, it retains its atomicness, - * while not having to worry about special handling that some code - * points may have. toke.c has converted the original Unicode values - * to native, so that we can just pass on the hex values unchanged. We - * do have to set a flag to keep recoding from happening in the - * recursion */ - - SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + * \x{char1}\x{char2}... + * and then either return it in <*substitute_parse> if non-null; or + * call reg recursively to parse it (enclosing in "(?: ... )" ). That + * way, it retains its atomicness, while not having to worry about + * special handling that some code points may have. toke.c has + * converted the original Unicode values to native, so that we can just + * pass on the hex values unchanged. We do have to set a flag to keep + * recoding from happening in the recursion */ + + SV * dummy = NULL; STRLEN len; char *orig_end = RExC_end; I32 flags; + if (substitute_parse) { + *substitute_parse = newSVpvs(""); + } + else { + substitute_parse = &dummy; + *substitute_parse = newSVpvs("?:"); + } + *substitute_parse = sv_2mortal(*substitute_parse); + while (RExC_parse < endbrace) { /* Convert to notation the rest of the code understands */ - sv_catpv(substitute_parse, "\\x{"); - sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); - sv_catpv(substitute_parse, "}"); + sv_catpv(*substitute_parse, "\\x{"); + sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(*substitute_parse, "}"); /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + count++; } - sv_catpv(substitute_parse, ")"); + if (! in_char_class) { + sv_catpv(*substitute_parse, ")"); + } - RExC_parse = SvPV(substitute_parse, len); + RExC_parse = SvPV(*substitute_parse, len); /* Don't allow empty number */ - if (len < 8) { + if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) { + RExC_parse = endbrace; vFAIL("Invalid hexadecimal number in \\N{U+...}"); } RExC_end = RExC_parse + len; @@ -10922,15 +10988,17 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, /* The values are Unicode, and therefore not subject to recoding */ RExC_override_recoding = 1; - if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; - return FALSE; + if (node_p) { + if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return (STRLEN) -1; + } + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + (UV) flags); } - FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", - (UV) flags); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; RExC_end = orig_end; @@ -10939,7 +11007,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, nextchar(pRExC_state); } - return TRUE; + return count; } @@ -11045,7 +11113,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, if (LOC || ! FOLD) { /* /l defers folding until runtime */ *character = (U8) code_point; } - else { /* Here is /i and not /l (toFOLD() is defined on just + else { /* Here is /i and not /l. (toFOLD() is defined on just ASCII, which isn't the same thing as INVARIANT on EBCDIC, but it works there, as the extra invariants fold to themselves) */ @@ -11076,7 +11144,10 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, ? FOLD_FLAGS_NOMIX_ASCII : 0)); if (downgradable - && folded == code_point + && folded == code_point /* This quickly rules out many + cases, avoiding the + _invlist_contains_cp() overhead + for those. */ && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) { OP(node) = EXACT; @@ -11261,10 +11332,8 @@ tryagain: nextchar(pRExC_state); if (RExC_flags & RXf_PMf_MULTILINE) ret = reg_node(pRExC_state, MBOL); - else if (RExC_flags & RXf_PMf_SINGLELINE) - ret = reg_node(pRExC_state, SBOL); else - ret = reg_node(pRExC_state, BOL); + ret = reg_node(pRExC_state, SBOL); Set_Node_Length(ret, 1); /* MJD */ break; case '$': @@ -11273,10 +11342,8 @@ tryagain: RExC_seen_zerolen++; if (RExC_flags & RXf_PMf_MULTILINE) ret = reg_node(pRExC_state, MEOL); - else if (RExC_flags & RXf_PMf_SINGLELINE) - ret = reg_node(pRExC_state, SEOL); else - ret = reg_node(pRExC_state, EOL); + ret = reg_node(pRExC_state, SEOL); Set_Node_Length(ret, 1); /* MJD */ break; case '.': @@ -11365,6 +11432,11 @@ tryagain: case 'A': RExC_seen_zerolen++; ret = reg_node(pRExC_state, SBOL); + /* SBOL is shared with /^/ so we set the flags so we can tell + * /\A/ from /^/ in split. We check ret because first pass we + * have no regop struct to set the flags on. */ + if (PASS2) + ret->flags = 1; *flagp |= SIMPLE; goto finish_meta_pat; case 'G': @@ -11396,7 +11468,7 @@ tryagain: ret = reg_node(pRExC_state, CANY); RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; - if (SIZE_ONLY) { + if (PASS2) { ckWARNdep(RExC_parse+1, "\\C is deprecated"); } goto finish_meta_pat; @@ -11425,7 +11497,7 @@ tryagain: ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; - if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + if ((U8) *(RExC_parse + 1) == '{') { /* diag_listed_as: Use "%s" instead of "%s" */ vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } @@ -11443,7 +11515,7 @@ tryagain: ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; - if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + if ((U8) *(RExC_parse + 1) == '{') { /* diag_listed_as: Use "%s" instead of "%s" */ vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } @@ -11552,8 +11624,9 @@ tryagain: * special treatment for quantifiers is not needed for such single * character sequences */ ++RExC_parse; - if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, - FALSE /* not strict */ )) { + if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp, + depth, FALSE)) + { if (*flagp & RESTART_UTF8) return NULL; RExC_parse--; @@ -11854,10 +11927,12 @@ tryagain: * point sequence. Handle those in the switch() above * */ RExC_parse = p + 1; - if (! grok_bslash_N(pRExC_state, NULL, &ender, - flagp, depth, FALSE, - FALSE /* not strict */ )) - { + if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL, + &ender, + flagp, + depth, + FALSE + )) { if (*flagp & RESTART_UTF8) FAIL("panic: grok_bslash_N set RESTART_UTF8"); RExC_parse = p = oldp; @@ -11896,7 +11971,7 @@ tryagain: bool valid = grok_bslash_o(&p, &result, &error_msg, - TRUE, /* out warnings */ + PASS2, /* out warnings */ FALSE, /* not strict */ TRUE, /* Output warnings for non- @@ -11925,7 +12000,7 @@ tryagain: bool valid = grok_bslash_x(&p, &result, &error_msg, - TRUE, /* out warnings */ + PASS2, /* out warnings */ FALSE, /* not strict */ TRUE, /* Output warnings for non- @@ -11948,7 +12023,7 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, SIZE_ONLY); + ender = grok_bslash_c(*p++, PASS2); break; case '8': case '9': /* must be a backreference */ --p; @@ -11987,7 +12062,7 @@ tryagain: REQUIRE_UTF8; } p += numlen; - if (SIZE_ONLY /* like \08, \178 */ + if (PASS2 /* like \08, \178 */ && numlen < 3 && p < RExC_end && isDIGIT(*p) && ckWARN(WARN_REGEXP)) @@ -12004,7 +12079,7 @@ tryagain: if (! RExC_override_recoding) { SV* enc = PL_encoding; ender = reg_recode((const char)(U8)ender, &enc); - if (!enc && SIZE_ONLY) + if (!enc && PASS2) ckWARNreg(p, "Invalid escape in the specified encoding"); REQUIRE_UTF8; } @@ -12455,10 +12530,10 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) UV high; int i; - if (end == UV_MAX && start <= 256) { + if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; } - else if (end >= 256) { + else if (end >= NUM_ANYOF_CODE_POINTS) { ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES; } @@ -12485,10 +12560,10 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) * *invlist_ptr; similarly for code points above the bitmap if we have * a flag to match all of them anyways */ if (change_invlist) { - _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); } if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { - _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr); } /* If have completely emptied it, remove it completely */ @@ -12738,9 +12813,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * upon an unescaped ']' that isn't one ending a regclass. To do both * these things, we need to realize that something preceded by a backslash * is escaped, so we have to keep track of backslashes */ - if (SIZE_ONLY) { - UV depth = 0; /* how many nested (?[...]) constructs */ - + if (PASS2) { Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, @@ -12748,6 +12821,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp))); + } + else { + UV depth = 0; /* how many nested (?[...]) constructs */ while (RExC_parse < RExC_end) { SV* current = NULL; @@ -13248,11 +13324,60 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl default: /* Use deprecated warning to increase the chances of this being * output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + if (PASS2) { + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + } break; } } +STATIC AV * +S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) +{ + /* This adds the string scalar to the array + * . is known to have exactly + * code points in it. This is used when constructing a + * bracketed character class and we find something that needs to match more + * than a single character. + * + * is actually an array of arrays. Each top-level + * element is an array that contains all the strings known so far that are + * the same length. And that length (in number of code points) is the same + * as the index of the top-level array. Hence, the [2] element is an + * array, each element thereof is a string containing TWO code points; + * while element [3] is for strings of THREE characters, and so on. Since + * this is for multi-char strings there can never be a [0] nor [1] element. + * + * When we rewrite the character class below, we will do so such that the + * longest strings are written first, so that it prefers the longest + * matching strings first. This is done even if it turns out that any + * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom + * Christiansen has agreed that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff', for example */ + + AV* this_array; + AV** this_array_ptr; + + PERL_ARGS_ASSERT_ADD_MULTI_MATCH; + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_string); + + return multi_char_matches; +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ @@ -13435,7 +13560,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (UCHARAT(RExC_parse) == ']') goto charclassloop; -parseit: while (1) { if (RExC_parse >= stop_ptr) { break; @@ -13515,19 +13639,54 @@ parseit: case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - /* We only pay attention to the first char of - multichar strings being returned. I kinda wonder - if this makes sense as it does change the behaviour - from earlier versions, OTOH that behaviour was broken - as well. */ - if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, - TRUE, /* => charclass */ - strict)) - { - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); - goto parseit; + SV *as_text; + STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value, + flagp, depth, &as_text); + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + if (cp_count != 1) { /* The typical case drops through */ + assert(cp_count != (STRLEN) -1); + if (cp_count == 0) { + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else if (PASS2) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } + } + else { /* cp_count > 1 */ + if (! RExC_in_multi_char_class) { + if (invert || range || *RExC_parse == '-') { + if (strict) { + RExC_parse--; + vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character"); + } + else if (PASS2) { + ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); + } + } + else { + multi_char_matches + = add_multi_match(multi_char_matches, + as_text, + cp_count); + } + break; /* contains the first code + point. Drop out of the switch to + process it */ + } + } /* End of cp_count != 1 */ + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; /* Back to top of loop to get next char */ } + /* Here, is a single code point, and contains it */ } break; case 'p': @@ -13716,8 +13875,8 @@ parseit: bool valid = grok_bslash_o(&RExC_parse, &value, &error_msg, - SIZE_ONLY, /* warnings in pass - 1 only */ + PASS2, /* warnings only in + pass 2 */ strict, silence_non_portable, UTF); @@ -13736,7 +13895,7 @@ parseit: bool valid = grok_bslash_x(&RExC_parse, &value, &error_msg, - TRUE, /* Output warnings */ + PASS2, /* Output warnings */ strict, silence_non_portable, UTF); @@ -13748,7 +13907,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, PASS2); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -13788,7 +13947,7 @@ parseit: if (strict) { vFAIL("Invalid escape in the specified encoding"); } - else if (SIZE_ONLY) { + else if (PASS2) { ckWARNreg(RExC_parse, "Invalid escape in the specified encoding"); } @@ -13966,22 +14125,23 @@ parseit: namedclass % 2 != 0, posixes_ptr); } - continue; /* Go get next character */ } } /* end of namedclass \blah */ - /* Here, we have a single value. If 'range' is set, it is the ending - * of a range--check its validity. Later, we will handle each - * individual code point in the range. If 'range' isn't set, this - * could be the beginning of a range, so check for that by looking - * ahead to see if the next real character to be processed is the range - * indicator--the minus sign */ - if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, FALSE /* means don't recognize comments */ ); } + /* If 'range' is set, 'value' is the ending of a range--check its + * validity. (If value isn't a single code point in the case of a + * range, we should have figured that out above in the code that + * catches false ranges). Later, we will handle each individual code + * point in the range. If 'range' isn't set, this could be the + * beginning of a range, so check for that by looking ahead to see if + * the next real character to be processed is the range indicator--the + * minus sign */ + if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; @@ -14011,15 +14171,15 @@ parseit: /* a bad range like \w-, [:word:]- ? */ if (namedclass > OOB_NAMEDCLASS) { - if (strict || ckWARN(WARN_REGEXP)) { - const int w = - RExC_parse >= rangebegin ? - RExC_parse - rangebegin : 0; + if (strict || (PASS2 && ckWARN(WARN_REGEXP))) { + const int w = RExC_parse >= rangebegin + ? RExC_parse - rangebegin + : 0; if (strict) { vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); } - else { + else if (PASS2) { vWARN4(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); @@ -14036,8 +14196,12 @@ parseit: } } - /* Here, is the beginning of the range, if any; or - * if not */ + if (namedclass > OOB_NAMEDCLASS) { + continue; + } + + /* Here, we have a single value, and is the beginning of + * the range, if any; or if not */ /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ @@ -14085,44 +14249,17 @@ parseit: * again. Otherwise add this character to the list of * multi-char folds. */ if (! RExC_in_multi_char_class) { - AV** this_array_ptr; - AV* this_array; STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen); SV* multi_fold = sv_2mortal(newSVpvs("")); Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + multi_char_matches + = add_multi_match(multi_char_matches, + multi_fold, + cp_count); - if (! multi_char_matches) { - multi_char_matches = newAV(); - } - - /* is actually an array of arrays. - * There will be one or two top-level elements: [2], - * and/or [3]. The [2] element is an array, each - * element thereof is a character which folds to TWO - * characters; [3] is for folds to THREE characters. - * (Unicode guarantees a maximum of 3 characters in any - * fold.) When we rewrite the character class below, - * we will do so such that the longest folds are - * written first, so that it prefers the longest - * matching strings first. This is done even if it - * turns out that any quantifier is non-greedy, out of - * programmer laziness. Tom Christiansen has agreed - * that this is ok. This makes the test for the - * ligature 'ffi' come before the test for 'ff' */ - if (av_exists(multi_char_matches, cp_count)) { - this_array_ptr = (AV**) av_fetch(multi_char_matches, - cp_count, FALSE); - this_array = *this_array_ptr; - } - else { - this_array = newAV(); - av_store(multi_char_matches, cp_count, - (SV*) this_array); - } - av_push(this_array, multi_fold); } /* This element should not be processed further in this @@ -14236,6 +14373,7 @@ parseit: RExC_parse = SvPV(substitute_parse, len); RExC_end = RExC_parse + len; RExC_in_multi_char_class = 1; + RExC_override_recoding = 1; RExC_emit = (regnode *)orig_emit; ret = reg(pRExC_state, 1, ®_flags, depth+1); @@ -14245,6 +14383,7 @@ parseit: RExC_parse = save_parse; RExC_end = save_end; RExC_in_multi_char_class = 0; + RExC_override_recoding = 0; SvREFCNT_dec_NN(multi_char_matches); return ret; } @@ -15715,8 +15854,6 @@ Perl_regdump(pTHX_ const regexp *r) } if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->intflags & PREGf_ANCH_BOL) - PerlIO_printf(Perl_debug_log, "(BOL)"); if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); if (r->intflags & PREGf_ANCH_SBOL) @@ -16078,6 +16215,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); + else if (OP(o) == SBOL) + Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(sv); @@ -16650,43 +16789,7 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } -/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ - -#ifndef PERL_IN_XSUB_RE -void -Perl_save_re_context(pTHX) -{ - /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ - if (PL_curpm) { - const REGEXP * const rx = PM_GETRE(PL_curpm); - if (rx) { - U32 i; - for (i = 1; i <= RX_NPARENS(rx); i++) { - char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), - "%lu", (long)i); - GV *const *const gvp - = (GV**)hv_fetch(PL_defstash, digits, len, 0); - - if (gvp) { - GV * const gv = *gvp; - if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) - save_scalar(gv); - } - } - } - } -} -#endif - #ifdef DEBUGGING - -/* Given that c is a control character, is it one for which we have a - * mnemonic? */ -#define isMNEMONIC_CNTRL(c) ((isSPACE_A(c) && (c) != '\v') \ - || (c) == '\a' \ - || (c) == '\b' \ - || (c) == ESC_NATIVE) /* Certain characters are output as a sequence with the first being a * backslash. */ #define isBACKSLASHED_PUNCT(c) \ @@ -16707,15 +16810,12 @@ S_put_code_point(pTHX_ SV *sv, UV c) sv_catpvn(sv, &string, 1); } else { - switch ((U8) c) { - case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; - case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break; - case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break; - case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; - case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; - case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; - case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; - default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c); break; + const char * const mnemonic = cntrl_to_mnemonic((char) c); + if (mnemonic) { + Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic); + } + else { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c); } } } @@ -16907,7 +17007,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) * ASCII puncts are set, including an extra amount for the backslashed * ones. */ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (BITMAP_TEST((U8 *) bitmap,i)) { + if (BITMAP_TEST(bitmap, i)) { *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i); if (isPUNCT_A(i)) { punct_count++; @@ -16967,8 +17067,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) /* Add everything remaining to the list, so when we invert it just * below, it will be excluded */ - *invlist_ptr = _add_range_to_invlist(*invlist_ptr, - NUM_ANYOF_CODE_POINTS, UV_MAX); + _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr); _invlist_invert(*invlist_ptr); }