X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/ca902fb80835be4a725df117917a4b62cc7022fe..d8d1dede53afc4f33cf63203b0992459fe964dc3:/regcomp.c diff --git a/regcomp.c b/regcomp.c index f0054e5..1d758a9 100644 --- a/regcomp.c +++ b/regcomp.c @@ -82,7 +82,6 @@ EXTERN_C const struct regexp_engine my_reg_engine; # include "regcomp.h" #endif -#include "dquote_inline.h" #include "invlist_inline.h" #include "unicode_constants.h" @@ -143,6 +142,8 @@ struct RExC_state_t { U32 seen; SSize_t size; /* Number of regnode equivalents in pattern */ + Size_t sets_depth; /* Counts recursion depth of already- + compiled regex set patterns */ /* position beyond 'precomp' of the warning message furthest away from * 'precomp'. During the parse, no warnings are raised for any problems @@ -267,6 +268,7 @@ struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_sets_depth (pRExC_state->sets_depth) #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) #define RExC_study_chunk_recursed_bytes \ (pRExC_state->study_chunk_recursed_bytes) @@ -343,9 +345,14 @@ struct RExC_state_t { #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] #define PBITVAL(paren) (1 << ((paren) & 7)) -#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) -#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) -#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) +#define PAREN_OFFSET(depth) \ + (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) +#define PAREN_TEST(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) +#define PAREN_SET(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) +#define PAREN_UNSET(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) #define REQUIRE_UTF8(flagp) STMT_START { \ if (!UTF) { \ @@ -883,11 +890,27 @@ static const scan_data_t zero_scan_data = { } STMT_END /* m is not necessarily a "literal string", in this macro */ -#define reg_warn_non_literal_string(loc, m) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ +#define warn_non_literal_string(loc, packed_warn, m) \ + _WARN_HELPER(loc, packed_warn, \ + Perl_warner(aTHX_ packed_warn, \ "%s" REPORT_LOCATION, \ m, REPORT_LOCATION_ARGS(loc))) +#define reg_warn_non_literal_string(loc, m) \ + warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) + +#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ + STMT_START { \ + char * format; \ + Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ + Newx(format, format_size, char); \ + my_strlcpy(format, m, format_size); \ + my_strlcat(format, REPORT_LOCATION, format_size); \ + SAVEFREEPV(format); \ + _WARN_HELPER(loc, packwarn, \ + Perl_ck_warner(aTHX_ packwarn, \ + format, \ + a1, REPORT_LOCATION_ARGS(loc))); \ + } STMT_END #define ckWARNreg(loc,m) \ _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ @@ -1379,29 +1402,6 @@ S_edit_distance(const UV* src, /* END of edit_distance() stuff * ========================================================= */ -/* is c a control character for which we have a mnemonic? */ -#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) - -STATIC const char * -S_cntrl_to_mnemonic(const U8 c) -{ - /* Returns the mnemonic string that represents character 'c', if one - * exists; NULL otherwise. The only ones that exist for the purposes of - * this routine are a few control characters */ - - switch (c) { - case '\a': return "\\a"; - case '\b': return "\\b"; - case ESC_NATIVE: return "\\e"; - case '\f': return "\\f"; - case '\n': return "\\n"; - case '\r': return "\\r"; - case '\t': return "\\t"; - } - - return NULL; -} - /* Mark that we cannot extend a found fixed substring at this point. Update the longest found anchored substring or the longest found floating substrings if needed. */ @@ -3589,7 +3589,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, str=STRING(convert); setSTR_LEN(convert, 0); } - setSTR_LEN(convert, STR_LEN(convert) + len); + assert( ( STR_LEN(convert) + len ) < 256 ); + setSTR_LEN(convert, (U8)(STR_LEN(convert) + len)); while (len--) *str++ = *ch++; } else { @@ -3603,7 +3604,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->prefixlen = (state-1); if (str) { regnode *n = convert+NODE_SZ_STR(convert); - NEXT_OFF(convert) = NODE_SZ_STR(convert); + assert( NODE_SZ_STR(convert) <= U16_MAX ); + NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert)); trie->startstate = state; trie->minlen -= (state - 1); trie->maxlen -= (state - 1); @@ -4029,11 +4031,6 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * using /iaa matching will be doing so almost entirely with ASCII * strings, so this should rarely be encountered in practice */ -#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ - if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \ - && OP(scan) != LEXACT_REQ8) \ - join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1) - STATIC U32 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *unfolded_multi_char, @@ -4197,7 +4194,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, merged++; NEXT_OFF(scan) += NEXT_OFF(n); - setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n)); + assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 ); + setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n))); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); @@ -4422,23 +4420,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, } #endif } - - if ( STR_LEN(scan) == 1 - && isALPHA_A(* STRING(scan)) - && ( OP(scan) == EXACTFAA - || ( OP(scan) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan))))) - { - U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ - - /* Replace a length 1 ASCII fold pair node with an ANYOFM node, - * with the mask set to the complement of the bit that differs - * between upper and lower case, and the lowest code point of the - * pair (which the '&' forces) */ - OP(scan) = ANYOFM; - ARG_SET(scan, *STRING(scan) & mask); - FLAGS(scan) = mask; - } } #ifdef DEBUGGING @@ -4498,6 +4479,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; + SSize_t final_minlen; /* There must be at least this number of characters to match */ SSize_t min = 0; I32 pars = 0, code; @@ -4541,15 +4523,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 j; for ( j = 0 ; j < recursed_depth ; j++ ) { for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { - if ( - PAREN_TEST(RExC_study_chunk_recursed + - ( j * RExC_study_chunk_recursed_bytes), i ) - && ( - !j || - !PAREN_TEST(RExC_study_chunk_recursed + - (( j - 1 ) * RExC_study_chunk_recursed_bytes), i) - ) - ) { + if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) { Perl_re_printf( aTHX_ " %d",(int)i); break; } @@ -4577,7 +4551,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * parsing code, as each (?:..) is handled by a different invocation of * reg() -- Yves */ - JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT + && OP(scan) != LEXACT_REQ8) + join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, + 0, NULL, depth + 1); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -5151,8 +5128,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( !recursed_depth - || - !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + || !PAREN_TEST(recursed_depth - 1, paren) ) { /* it is quite possible that there are more efficient ways * to do this. We maintain a bitmap per level of recursion @@ -5167,13 +5143,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (!recursed_depth) { Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); } else { - Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), - RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + Copy(PAREN_OFFSET(recursed_depth - 1), + PAREN_OFFSET(recursed_depth), RExC_study_chunk_recursed_bytes, U8); } /* we havent recursed into this paren yet, so recurse into it */ DEBUG_STUDYDATA("gosub-set", data, depth, is_inf); - PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + PAREN_SET(recursed_depth, paren); my_recursed_depth= recursed_depth + 1; } else { DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf); @@ -5238,17 +5214,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == LEXACT_REQ8 || OP(scan) == EXACTL) { - SSize_t l = STR_LEN(scan); + SSize_t bytelen = STR_LEN(scan), charlen; UV uc; - assert(l); + assert(bytelen); if (UTF) { const U8 * const s = (U8*)STRING(scan); - uc = utf8_to_uvchr_buf(s, s + l, NULL); - l = utf8_length(s, s + l); + uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); + charlen = utf8_length(s, s + bytelen); } else { uc = *((U8*)STRING(scan)); + charlen = bytelen; } - min += l; + min += charlen; if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ /* The code below prefers earlier match for fixed offset, later match for variable offset. */ @@ -5257,7 +5234,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->last_start_max = is_inf ? SSize_t_MAX : data->pos_min + data->pos_delta; } - sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + sv_catpvn(data->last_found, STRING(scan), bytelen); if (UTF) SvUTF8_on(data->last_found); { @@ -5265,11 +5242,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + mg->mg_len += charlen; } - data->last_end = data->pos_min + l; - data->pos_min += l; /* As in the first entry. */ + data->last_end = data->pos_min + charlen; + data->pos_min += charlen; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } @@ -5291,25 +5267,42 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is EXACTFish */ - SSize_t l = STR_LEN(scan); + SSize_t bytelen = STR_LEN(scan), charlen; const U8 * s = (U8*)STRING(scan); + /* Replace a length 1 ASCII fold pair node with an ANYOFM node, + * with the mask set to the complement of the bit that differs + * between upper and lower case, and the lowest code point of the + * pair (which the '&' forces) */ + if ( bytelen == 1 + && isALPHA_A(*s) + && ( OP(scan) == EXACTFAA + || ( OP(scan) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))) + { + U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ + + OP(scan) = ANYOFM; + ARG_SET(scan, *s & mask); + FLAGS(scan) = mask; + /* we're not EXACTFish any more, so restudy */ + continue; + } + /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); scan_commit(pRExC_state, data, minlenp, is_inf); } - if (UTF) { - l = utf8_length(s, s + l); - } + charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen; if (unfolded_multi_char) { RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } - min += l - min_subtract; + min += charlen - min_subtract; assert (min >= 0); delta += min_subtract; if (flags & SCF_DO_SUBSTR) { - data->pos_min += l - min_subtract; + data->pos_min += charlen - min_subtract; if (data->pos_min < 0) { data->pos_min = 0; } @@ -5779,10 +5772,8 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", continue; default: -#ifdef DEBUGGING Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", OP(scan)); -#endif case REF: case CLUMP: if (flags & SCF_DO_SUBSTR) { @@ -6433,6 +6424,11 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", if (trie->jump) /* no more substrings -- for now /grr*/ flags &= ~SCF_DO_SUBSTR; } + else if (OP(scan) == REGEX_SET) { + Perl_croak(aTHX_ "panic: %s regnode should be resolved" + " before optimization", reg_name[REGEX_SET]); + } + #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -6484,18 +6480,15 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", DEBUG_STUDYDATA("post-fin", data, depth, is_inf); - { - SSize_t final_minlen= min < stopmin ? min : stopmin; - - if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { - if (final_minlen > SSize_t_MAX - delta) - RExC_maxlen = SSize_t_MAX; - else if (RExC_maxlen < final_minlen + delta) - RExC_maxlen = final_minlen + delta; - } - return final_minlen; + final_minlen = min < stopmin + ? min : stopmin; + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { + if (final_minlen > SSize_t_MAX - delta) + RExC_maxlen = SSize_t_MAX; + else if (RExC_maxlen < final_minlen + delta) + RExC_maxlen = final_minlen + delta; } - NOT_REACHED; /* NOTREACHED */ + return final_minlen; } STATIC U32 @@ -7685,6 +7678,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_study_chunk_recursed = NULL; RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; + RExC_sets_depth = 0; pRExC_state->code_index = 0; /* Initialize the string in the compiled pattern. This is so that there is @@ -12185,7 +12179,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } break; } - DEBUG_PARSE_r( + DEBUG_PARSE_r({ DEBUG_PARSE_MSG("lsbr"); regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state); regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state); @@ -12196,7 +12190,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)ender, (IV)(ender - lastbr) ); - ); + }); if (! REGTAIL(pRExC_state, lastbr, ender)) { REQUIRE_BRANCHJ(flagp, 0); } @@ -12237,7 +12231,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) br= PL_regkind[OP(ret_as_regnode)] != BRANCH ? regnext(ret_as_regnode) : ret_as_regnode; - DEBUG_PARSE_r( + DEBUG_PARSE_r({ DEBUG_PARSE_MSG("NADA"); regprop(RExC_rx, RExC_mysv1, ret_as_regnode, NULL, pRExC_state); @@ -12250,7 +12244,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)ender, (IV)(ender - ret) ); - ); + }); OP(br)= NOTHING; if (OP(REGNODE_p(ender)) == TAIL) { NEXT_OFF(br)= 0; @@ -12621,12 +12615,22 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } nest_check: if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { - ckWARN2reg(RExC_parse, - "%" UTF8f " matches null string many times", - UTF8fARG(UTF, (RExC_parse >= origparse - ? RExC_parse - origparse - : 0), - origparse)); + if (origparse[0] == '\\' && origparse[1] == 'K') { + vFAIL2utf8f( + "%" UTF8f " is forbidden - matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + /* NOT-REACHED */ + } else { + ckWARN2reg(RExC_parse, + "%" UTF8f " matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + } } if (*RExC_parse == '?') { @@ -12959,48 +12963,30 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * thing. */ do { /* Loop until the ending brace */ - UV cp = 0; - char * start_digit; /* The first of the current code point */ - if (! isXDIGIT(*RExC_parse)) { + I32 flags = PERL_SCAN_SILENT_OVERFLOW + | PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_NOTIFY_ILLDIGIT + | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + STRLEN len = endbrace - RExC_parse; + NV overflow_value; + char * start_digit = RExC_parse; + UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value); + + if (len == 0) { RExC_parse++; + bad_NU: vFAIL("Invalid hexadecimal number in \\N{U+...}"); } - start_digit = RExC_parse; - count++; - - /* Loop through the hex digits of the current code point */ - do { - /* Adding this digit will shift the result 4 bits. If that - * result would be above the legal max, it's overflow */ - if (cp > MAX_LEGAL_CP >> 4) { + RExC_parse += len; - /* Find the end of the code point */ - do { - RExC_parse ++; - } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_'); - - /* Be sure to synchronize this message with the similar one - * in utf8.c */ - vFAIL4("Use of code point 0x%.*s is not allowed; the" - " permissible max is 0x%" UVxf, - (int) (RExC_parse - start_digit), start_digit, - MAX_LEGAL_CP); - } - - /* Accumulate this (valid) digit into the running total */ - cp = (cp << 4) + READ_XDIGIT(RExC_parse); - - /* READ_XDIGIT advanced the input pointer. Ignore a single - * underscore separator */ - if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) { - RExC_parse++; - } - } while (isXDIGIT(*RExC_parse)); + if (cp > MAX_LEGAL_CP) { + vFAIL(form_cp_too_large_msg(16, start_digit, len, 0)); + } - /* Here, have accumulated the next code point */ - if (RExC_parse >= endbrace) { /* If done ... */ - if (count != 1) { + if (RExC_parse >= endbrace) { /* Got to the closing '}' */ + if (count) { goto do_concat; } @@ -13017,18 +13003,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, return TRUE; } - /* Here, the only legal thing would be a multiple character - * sequence (of the form "\N{U+c1.c2. ... }". So the next - * character must be a dot (and the one after that can't be the - * endbrace, or we'd have something like \N{U+100.} ) */ + /* Here, the parse stopped bfore the ending brace. This is legal + * only if that character is a dot separating code points, like a + * multiple character sequence (of the form "\N{U+c1.c2. ... }". + * So the next character must be a dot (and the one after that + * can't be the endbrace, or we'd have something like \N{U+100.} ) + * */ if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) { RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ - ? UTF8SKIP(RExC_parse) - : 1; - if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */ - RExC_parse = endbrace; - } - vFAIL("Invalid hexadecimal number in \\N{U+...}"); + ? UTF8SKIP(RExC_parse) + : 1; + RExC_parse = MIN(endbrace, RExC_parse);/* Guard against + malformed utf8 */ + goto bad_NU; } /* Here, looks like its really a multiple character sequence. Fail @@ -13046,7 +13033,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * but go through the motions of code point counting and error * checking, if the caller doesn't want a node returned. */ - if (node_p && count == 1) { + if (node_p && ! substitute_parse) { substitute_parse = newSVpvs("?:"); } @@ -13962,6 +13949,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) || ! is_PATWS_safe((p), RExC_end, UTF)); switch ((U8)*p) { + const char* message; + U32 packed_warn; + U8 grok_c_char; + case '^': case '$': case '.': @@ -14077,67 +14068,70 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) p++; break; case 'o': - { - UV result; - const char* error_msg; - - bool valid = grok_bslash_o(&p, - RExC_end, - &result, - &error_msg, - TO_OUTPUT_WARNINGS(p), - (bool) RExC_strict, - TRUE, /* Output warnings - for non- - portables */ - UTF); - if (! valid) { - RExC_parse = p; /* going to die anyway; point - to exact spot of failure */ - vFAIL(error_msg); - } - UPDATE_WARNINGS_LOC(p - 1); - ender = result; - break; - } + if (! grok_bslash_o(&p, + RExC_end, + &ender, + &message, + &packed_warn, + (bool) RExC_strict, + FALSE, /* No illegal cp's */ + UTF)) + { + RExC_parse = p; /* going to die anyway; point to + exact spot of failure */ + vFAIL(message); + } + + if (message && TO_OUTPUT_WARNINGS(p)) { + warn_non_literal_string(p, packed_warn, message); + } + break; case 'x': - { - UV result = UV_MAX; /* initialize to erroneous - value */ - const char* error_msg; - - bool valid = grok_bslash_x(&p, - RExC_end, - &result, - &error_msg, - TO_OUTPUT_WARNINGS(p), - (bool) RExC_strict, - TRUE, /* Silence warnings - for non- - portables */ - UTF); - if (! valid) { - RExC_parse = p; /* going to die anyway; point - to exact spot of failure */ - vFAIL(error_msg); - } - UPDATE_WARNINGS_LOC(p - 1); - ender = result; + if (! grok_bslash_x(&p, + RExC_end, + &ender, + &message, + &packed_warn, + (bool) RExC_strict, + FALSE, /* No illegal cp's */ + UTF)) + { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(message); + } + + if (message && TO_OUTPUT_WARNINGS(p)) { + warn_non_literal_string(p, packed_warn, message); + } #ifdef EBCDIC - if (ender < 0x100) { - if (RExC_recode_x_to_native) { - ender = LATIN1_TO_NATIVE(ender); - } - } + if (ender < 0x100) { + if (RExC_recode_x_to_native) { + ender = LATIN1_TO_NATIVE(ender); + } + } #endif - break; - } + break; case 'c': - p++; - ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p)); - UPDATE_WARNINGS_LOC(p); p++; + if (! grok_bslash_c(*p, &grok_c_char, + &message, &packed_warn)) + { + /* going to die anyway; point to exact spot of + * failure */ + RExC_parse = p + ((UTF) + ? UTF8_SAFE_SKIP(p, RExC_end) + : 1); + vFAIL(message); + } + + ender = grok_c_char; + p++; + if (message && TO_OUTPUT_WARNINGS(p)) { + warn_non_literal_string(p, packed_warn, message); + } + break; case '8': case '9': /* must be a backreference */ --p; @@ -14172,17 +14166,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* FALLTHROUGH */ case '0': { - I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + I32 flags = PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_NOTIFY_ILLDIGIT; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); p += numlen; - if ( isDIGIT(*p) /* like \08, \178 */ - && ckWARN(WARN_REGEXP) - && numlen < 3) + if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT) + && isDIGIT(*p) /* like \08, \178 */ + && ckWARN(WARN_REGEXP)) { reg_warn_non_literal_string( - p + 1, - form_short_octal_warning(p, numlen)); + p + 1, + form_alien_digit_msg(8, numlen, p, + RExC_end, UTF, FALSE)); } } break; @@ -14259,6 +14255,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (ender > 255) { REQUIRE_UTF8(flagp); + if ( UNICODE_IS_PERL_EXTENDED(ender) + && TO_OUTPUT_WARNINGS(p)) + { + ckWARN2_non_literal_string(p, + packWARN(WARN_PORTABLE), + PL_extended_cp_format, + ender); + } } /* We need to check if the next non-ignored thing is a @@ -14587,8 +14591,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else if (FOLD) { bool splittable = FALSE; bool backed_up = FALSE; - char * e; - char * s_start; + char * e; /* should this be U8? */ + char * s_start; /* should this be U8? */ /* Here is /i. Running out of room creates a problem if we are * folding, and the split happens in the middle of a @@ -14905,7 +14909,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if ( ender != LATIN_SMALL_LETTER_SHARP_S || ASCII_FOLD_RESTRICTED) { - *e++ = toLOWER_L1(ender); + assert( toLOWER_L1(ender) < 256 ); + *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */ } else { *e++ = 's'; @@ -14923,7 +14928,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S || ASCII_FOLD_RESTRICTED) { - *e++ = toLOWER_L1(ender); + assert( toLOWER_L1(ender) < 256 ); + *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */ } else { *e++ = 's'; @@ -16232,7 +16238,10 @@ redo_curchar: && UCHARAT(RExC_parse + 1) == '?' && UCHARAT(RExC_parse + 2) == '^') { - /* If is a '(?', could be an embedded '(?^flags:(?[...])'. + const regnode_offset orig_emit = RExC_emit; + SV * resultant_invlist; + + /* If is a '(?^', could be an embedded '(?^flags:(?[...])'. * This happens when we have some thing like * * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; @@ -16241,62 +16250,33 @@ redo_curchar: * * Here we would be handling the interpolated * '$thai_or_lao'. We handle this by a recursive call to - * ourselves which returns the inversion list the - * interpolated expression evaluates to. We use the flags - * from the interpolated pattern. */ - U32 save_flags = RExC_flags; - const char * save_parse; - - RExC_parse += 2; /* Skip past the '(?' */ - save_parse = RExC_parse; - - /* Parse the flags for the '(?'. We already know the first - * flag to parse is a '^' */ - parse_lparen_question_flags(pRExC_state); - - if ( RExC_parse >= RExC_end - 4 - || UCHARAT(RExC_parse) != ':' - || UCHARAT(++RExC_parse) != '(' - || UCHARAT(++RExC_parse) != '?' - || UCHARAT(++RExC_parse) != '[') - { + * reg which returns the inversion list the + * interpolated expression evaluates to. Actually, the + * return is a special regnode containing a pointer to that + * inversion list. If the return isn't that regnode alone, + * we know that this wasn't such an interpolation, which is + * an error: we need to get a single inversion list back + * from the recursion */ - /* In combination with the above, this moves the - * pointer to the point just after the first erroneous - * character. */ - if (RExC_parse >= RExC_end - 4) { - RExC_parse = RExC_end; - } - else if (RExC_parse != save_parse) { - RExC_parse += (UTF) - ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) - : 1; - } - vFAIL("Expecting '(?flags:(?[...'"); - } - - /* Recurse, with the meat of the embedded expression */ RExC_parse++; - if (! handle_regex_sets(pRExC_state, ¤t, flagp, - depth+1, oregcomp_parse)) - { - RETURN_FAIL_ON_RESTART(*flagp, flagp); - } + RExC_sets_depth++; - /* Here, 'current' contains the embedded expression's - * inversion list, and RExC_parse points to the trailing - * ']'; the next character should be the ')' */ - RExC_parse++; - if (UCHARAT(RExC_parse) != ')') - vFAIL("Expecting close paren for nested extended charclass"); + node = reg(pRExC_state, 2, flagp, depth+1); + RETURN_FAIL_ON_RESTART(*flagp, flagp); - /* Then the ')' matching the original '(' handled by this - * case: statement */ - RExC_parse++; - if (UCHARAT(RExC_parse) != ')') - vFAIL("Expecting close paren for wrapper for nested extended charclass"); + if ( OP(REGNODE_p(node)) != REGEX_SET + /* If more than a single node returned, the nested + * parens evaluated to more than just a (?[...]), + * which isn't legal */ + || node != 1) { + vFAIL("Expecting interpolated extended charclass"); + } + resultant_invlist = (SV *) ARGp(REGNODE_p(node)); + current = invlist_clone(resultant_invlist, NULL); + SvREFCNT_dec(resultant_invlist); - RExC_flags = save_flags; + RExC_sets_depth--; + RExC_emit = orig_emit; goto handle_operand; } @@ -16684,6 +16664,13 @@ redo_curchar: return END; } + if (RExC_sets_depth) { /* If within a recursive call, return in a special + regnode */ + RExC_parse++; + node = regpnode(pRExC_state, REGEX_SET, (void *) final); + } + else { + /* Otherwise generate a resultant node, based on 'final'. regclass() is * expecting a string of ranges and individual code points */ invlist_iterinit(final); @@ -16767,6 +16754,7 @@ redo_curchar: ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } + } nextchar(pRExC_state); Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -16934,6 +16922,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings) PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS; if (! TO_OUTPUT_WARNINGS(RExC_parse)) { + CLEAR_POSIX_WARNINGS(); return; } @@ -17364,6 +17353,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * is already in 'value'. Otherwise, need to translate the escape * into what it signifies. */ if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { + const char * message; + U32 packed_warn; + U8 grok_c_char; case 'w': namedclass = ANYOF_WORDCHAR; break; case 'W': namedclass = ANYOF_NWORDCHAR; break; @@ -17614,53 +17606,74 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ - { - const char* error_msg; - bool valid = grok_bslash_o(&RExC_parse, - RExC_end, - &value, - &error_msg, - TO_OUTPUT_WARNINGS(RExC_parse), - strict, - silence_non_portable, - UTF); - if (! valid) { - vFAIL(error_msg); - } - UPDATE_WARNINGS_LOC(RExC_parse - 1); - } - non_portable_endpoint++; + if (! grok_bslash_o(&RExC_parse, + RExC_end, + &value, + &message, + &packed_warn, + strict, + cBOOL(range), /* MAX_UV allowed for range + upper limit */ + UTF)) + { + vFAIL(message); + } + else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { + warn_non_literal_string(RExC_parse, packed_warn, message); + } + + if (value < 256) { + non_portable_endpoint++; + } break; case 'x': RExC_parse--; /* function expects to be pointed at the 'x' */ - { - const char* error_msg; - bool valid = grok_bslash_x(&RExC_parse, - RExC_end, - &value, - &error_msg, - TO_OUTPUT_WARNINGS(RExC_parse), - strict, - silence_non_portable, - UTF); - if (! valid) { - vFAIL(error_msg); - } - UPDATE_WARNINGS_LOC(RExC_parse - 1); - } - non_portable_endpoint++; + if (! grok_bslash_x(&RExC_parse, + RExC_end, + &value, + &message, + &packed_warn, + strict, + cBOOL(range), /* MAX_UV allowed for range + upper limit */ + UTF)) + { + vFAIL(message); + } + else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { + warn_non_literal_string(RExC_parse, packed_warn, message); + } + + if (value < 256) { + non_portable_endpoint++; + } break; case 'c': - value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse)); - UPDATE_WARNINGS_LOC(RExC_parse); - RExC_parse++; + if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message, + &packed_warn)) + { + /* going to die anyway; point to exact spot of + * failure */ + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; + vFAIL(message); + } + + value = grok_c_char; + RExC_parse++; + if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { + warn_non_literal_string(RExC_parse, packed_warn, message); + } + non_portable_endpoint++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { /* Take 1-3 octal digits */ - I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + I32 flags = PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_NOTIFY_ILLDIGIT; numlen = (strict) ? 4 : 3; value = grok_oct(--RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; @@ -17671,17 +17684,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, : 1; vFAIL("Need exactly 3 octal digits"); } - else if ( numlen < 3 /* like \08, \178 */ + else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT) && RExC_parse < RExC_end && isDIGIT(*RExC_parse) && ckWARN(WARN_REGEXP)) { reg_warn_non_literal_string( RExC_parse + 1, - form_short_octal_warning(RExC_parse, numlen)); + form_alien_digit_msg(8, numlen, RExC_parse, + RExC_end, UTF, FALSE)); } } - non_portable_endpoint++; + if (value < 256) { + non_portable_endpoint++; + } break; } default: @@ -17943,7 +17959,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* non-Latin1 code point implies unicode semantics. */ if (value > 255) { + if (value > MAX_LEGAL_CP && ( value != UV_MAX + || prevvalue > MAX_LEGAL_CP)) + { + vFAIL(form_cp_too_large_msg(16, NULL, 0, value)); + } REQUIRE_UNI_RULES(flagp, 0); + if ( ! silence_non_portable + && UNICODE_IS_PERL_EXTENDED(value) + && TO_OUTPUT_WARNINGS(RExC_parse)) + { + ckWARN2_non_literal_string(RExC_parse, + packWARN(WARN_PORTABLE), + PL_extended_cp_format, + value); + } } /* Ready to process either the single value, or the completed range. @@ -18668,17 +18698,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, invert = FALSE; } + /* All possible optimizations below still have these characteristics. + * (Multi-char folds aren't SIMPLE, but they don't get this far in this + * routine) */ + *flagp |= HASWIDTH|SIMPLE; + if (ret_invlist) { *ret_invlist = cp_list; return RExC_emit; } - /* All possible optimizations below still have these characteristics. - * (Multi-char folds aren't SIMPLE, but they don't get this far in this - * routine) */ - *flagp |= HASWIDTH|SIMPLE; - if (anyof_flags & ANYOF_LOCALE_FLAGS) { RExC_contains_locale = 1; } @@ -19848,10 +19878,11 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, /* The data consists of just strings defining user-defined * property names, but in prior incarnations, and perhaps * somehow from pluggable regex engines, it could still - * hold hex code point definitions. Each component of a - * range would be separated by a tab, and each range by a - * new-line. If these are found, instead add them to the - * inversion list */ + * hold hex code point definitions, all of which should be + * legal (or it wouldn't have gotten this far). Each + * component of a range would be separated by a tab, and + * each range by a new-line. If these are found, instead + * add them to the inversion list */ I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT |PERL_SCAN_SILENT_NON_PORTABLE; STRLEN len = remaining; @@ -20176,6 +20207,22 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) return(ret); } +/* +- regpnode - emit a temporary node with a void* argument +*/ +STATIC regnode_offset /* Location. */ +S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg) +{ + const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode"); + regnode_offset ptr = ret; + + PERL_ARGS_ASSERT_REGPNODE; + + FILL_ADVANCE_NODE_ARGp(ptr, op, arg); + RExC_emit = ptr; + return(ret); +} + STATIC regnode_offset S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) { @@ -23163,9 +23210,10 @@ Perl_parse_uniprop_string(pTHX_ dVAR; char* lookup_name; /* normalized name for lookup in our tables */ unsigned lookup_len; /* Its length */ - bool stricter = FALSE; /* Some properties have stricter name - normalization rules, which we decide upon - based on parsing */ + enum { Not_Strict = 0, /* Some properties have stricter name */ + Strict, /* normalization rules, which we decide */ + As_Is /* upon based on parsing */ + } stricter = Not_Strict; /* nv= or numeric_value=, or possibly one of the cjk numeric properties * (though it requires extra effort to download them from Unicode and @@ -23316,7 +23364,7 @@ Perl_parse_uniprop_string(pTHX_ * or are positioned just after the '=' if it is compound. */ if (equals_pos >= 0) { - assert(! stricter); /* We shouldn't have set this yet */ + assert(stricter == Not_Strict); /* We shouldn't have set this yet */ /* Space immediately after the '=' is ignored */ i++; @@ -23495,6 +23543,93 @@ Perl_parse_uniprop_string(pTHX_ * some constructs in their subpattern, like \A. */ } /* End of is a wildcard subppattern */ + /* \p{name=...} is handled specially. Instead of using the normal + * mechanism involving charclass_invlists.h, it uses _charnames.pm + * which has the necessary (huge) data accessible to it, and which + * doesn't get loaded unless necessary. The legal syntax for names is + * somewhat different than other properties due both to the vagaries of + * a few outlier official names, and the fact that only a few ASCII + * characters are permitted in them */ + if ( memEQs(lookup_name, j - 1, "name") + || memEQs(lookup_name, j - 1, "na")) + { + dSP; + HV * table; + SV * character; + const char * error_msg; + CV* lookup_loose; + SV * character_name; + STRLEN character_len; + UV cp; + + stricter = As_Is; + + /* Since the RHS (after skipping initial space) is passed unchanged + * to charnames, and there are different criteria for what are + * legal characters in the name, just parse it here. A character + * name must begin with an ASCII alphabetic */ + if (! isALPHA(name[i])) { + goto failed; + } + lookup_name[j++] = name[i]; + + for (++i; i < name_len; i++) { + /* Official names can only be in the ASCII range, and only + * certain characters */ + if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) { + goto failed; + } + lookup_name[j++] = name[i]; + } + + /* Finished parsing, save the name into an SV */ + character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos); + + /* Make sure _charnames is loaded. (The parameters give context + * for any errors generated */ + table = load_charnames(character_name, name, name_len, &error_msg); + if (table == NULL) { + sv_catpv(msg, error_msg); + goto append_name_to_msg; + } + + lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0); + if (! lookup_loose) { + Perl_croak(aTHX_ + "panic: Can't find '_charnames::_loose_regcomp_lookup"); + } + + PUSHSTACKi(PERLSI_OVERLOAD); + ENTER ; + SAVETMPS; + + PUSHMARK(SP) ; + XPUSHs(character_name); + PUTBACK; + call_sv(MUTABLE_SV(lookup_loose), G_SCALAR); + + SPAGAIN ; + + character = POPs; + SvREFCNT_inc_simple_void_NN(character); + + PUTBACK ; + FREETMPS ; + LEAVE ; + POPSTACK; + + if (! SvOK(character)) { + goto failed; + } + + cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len); + if (character_len < SvCUR(character)) { + goto failed; + } + + prop_definition = add_cp_to_invlist(NULL, cp); + return prop_definition; + } /* Certain properties whose values are numeric need special handling. * They may optionally be prefixed by 'is'. Ignore that prefix for the @@ -23547,12 +23682,12 @@ Perl_parse_uniprop_string(pTHX_ * But the numeric type properties can have the alphas [Ee] to * signify an exponent, and it is still a number with stricter * rules. So look for an alpha that signifies not-strict */ - stricter = TRUE; + stricter = Strict; for (k = i; k < name_len; k++) { if ( isALPHA_A(name[k]) && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E'))) { - stricter = FALSE; + stricter = Not_Strict; break; } } @@ -23590,7 +23725,7 @@ Perl_parse_uniprop_string(pTHX_ && memNEs(lookup_name + 4, j - 4, "space") && memNEs(lookup_name + 4, j - 4, "word")) { - stricter = TRUE; + stricter = Strict; /* We set the inputs back to 0 and the code below will reparse, * using strict */ @@ -23938,6 +24073,7 @@ Perl_parse_uniprop_string(pTHX_ * but not yet used. */ save_item(PL_subname); + /* G_SCALAR guarantees a single return value */ (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR); SPAGAIN; @@ -23965,7 +24101,7 @@ Perl_parse_uniprop_string(pTHX_ (void) POPs; prop_definition = NULL; } - else { /* G_SCALAR guarantees a single return value */ + else { SV * contents = POPs; /* The contents is supposed to be the expansion of the property