X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/5b9ce45660f01f87f17ed2211a1c01540d903a41..ca76e4e9adfa1629d8fee4608cbeab9d9ba91ed3:/regcomp.c diff --git a/regcomp.c b/regcomp.c index f748e79..68c599f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -105,6 +105,10 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif +#ifndef MAX +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + /* this is a chain of data about sub patterns we are processing that need to be handled separately/specially in study_chunk. Its so we can simulate recursion without losing state. */ @@ -131,6 +135,7 @@ struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ + char *precomp_end; /* pointer to end of uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object @@ -138,6 +143,8 @@ struct RExC_state_t { char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ + char *adjusted_start; /* 'start', adjusted. See code use */ + STRLEN precomp_adj; /* an offset beyond precomp. See code use */ SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ regnode *emit_bound; /* First regnode outside of the @@ -192,7 +199,6 @@ struct RExC_state_t { scan_frame *frame_head; scan_frame *frame_last; U32 frame_count; - U32 strict; #ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -215,11 +221,15 @@ struct RExC_state_t { #endif bool seen_unfolded_sharp_s; + bool strict; }; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_precomp_adj (pRExC_state->precomp_adj) +#define RExC_adjusted_start (pRExC_state->adjusted_start) +#define RExC_precomp_end (pRExC_state->precomp_end) #define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) @@ -554,9 +564,67 @@ static const scan_data_t zero_scan_data = #define REPORT_LOCATION " in regex; marked by " MARKER1 \ " in m/%"UTF8f MARKER2 "%"UTF8f"/" -#define REPORT_LOCATION_ARGS(offset) \ - UTF8fARG(UTF, offset, RExC_precomp), \ - UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) +/* The code in this file in places uses one level of recursion with parsing + * rebased to an alternate string constructed by us in memory. This can take + * the form of something that is completely different from the input, or + * something that uses the input as part of the alternate. In the first case, + * there should be no possibility of an error, as we are in complete control of + * the alternate string. But in the second case we don't control the input + * portion, so there may be errors in that. Here's an example: + * /[abc\x{DF}def]/ui + * is handled specially because \x{df} folds to a sequence of more than one + * character, 'ss'. What is done is to create and parse an alternate string, + * which looks like this: + * /(?:\x{DF}|[abc\x{DF}def])/ui + * where it uses the input unchanged in the middle of something it constructs, + * which is a branch for the DF outside the character class, and clustering + * parens around the whole thing. (It knows enough to skip the DF inside the + * class while in this substitute parse.) 'abc' and 'def' may have errors that + * need to be reported. The general situation looks like this: + * + * sI tI xI eI + * Input: ---------------------------------------------------- + * Constructed: --------------------------------------------------- + * sC tC xC eC EC + * + * The input string sI..eI is the input pattern. The string sC..EC is the + * constructed substitute parse string. The portions sC..tC and eC..EC are + * constructed by us. The portion tC..eC is an exact duplicate of the input + * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that + * while parsing, we find an error at xC. We want to display a message showing + * the real input string. Thus we need to find the point xI in it which + * corresponds to xC. xC >= tC, since the portion of the string sC..tC has + * been constructed by us, and so shouldn't have errors. We get: + * + * xI = sI + (tI - sI) + (xC - tC) + * + * and, the offset into sI is: + * + * (xI - sI) = (tI - sI) + (xC - tC) + * + * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj, + * and we save tC as RExC_adjusted_start. + * + * During normal processing of the input pattern, everything points to that, + * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI. + */ + +#define tI_sI RExC_precomp_adj +#define tC RExC_adjusted_start +#define sC RExC_precomp +#define xI_offset(xC) ((IV) (tI_sI + (xC - tC))) +#define xI(xC) (sC + xI_offset(xC)) +#define eC RExC_precomp_end + +#define REPORT_LOCATION_ARGS(xC) \ + UTF8fARG(UTF, \ + (xI(xC) > eC) /* Don't run off end */ \ + ? eC - sC /* Length before the <--HERE */ \ + : xI_offset(xC), \ + sC), /* The input pattern printed up to the <--HERE */ \ + UTF8fARG(UTF, \ + (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \ + (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */ /* Used to point after bad bytes for an error message, but avoid skipping * past a nul byte. */ @@ -569,7 +637,7 @@ static const scan_data_t zero_scan_data = */ #define _FAIL(code) STMT_START { \ const char *ellipses = ""; \ - IV len = RExC_end - RExC_precomp; \ + IV len = RExC_precomp_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ SAVEFREESV(RExC_rx_sv); \ @@ -593,10 +661,8 @@ static const scan_data_t zero_scan_data = * Simple_vFAIL -- like FAIL, but marks the current location in the scan */ #define Simple_vFAIL(m) STMT_START { \ - const IV offset = \ - (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(offset)); \ + m, REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* @@ -612,9 +678,8 @@ static const scan_data_t zero_scan_data = * Like Simple_vFAIL(), but accepts two arguments. */ #define Simple_vFAIL2(m,a1) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(offset)); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* @@ -631,9 +696,8 @@ static const scan_data_t zero_scan_data = * Like Simple_vFAIL(), but accepts three arguments. */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(offset)); \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* @@ -649,9 +713,8 @@ static const scan_data_t zero_scan_data = * Like Simple_vFAIL(), but accepts four arguments. */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ - REPORT_LOCATION_ARGS(offset)); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -661,20 +724,18 @@ static const scan_data_t zero_scan_data = } STMT_END /* A specialized version of vFAIL2 that works with UTF8f */ -#define vFAIL2utf8f(m, a1) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ - if (!SIZE_ONLY) \ - SAVEFREESV(RExC_rx_sv); \ - S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(offset)); \ +#define vFAIL2utf8f(m, a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END #define vFAIL3utf8f(m, a1, a2) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ if (!SIZE_ONLY) \ SAVEFREESV(RExC_rx_sv); \ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(offset)); \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* These have asserts in them because of [perl #122671] Many warnings in @@ -685,84 +746,86 @@ static const scan_data_t zero_scan_data = /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN(loc, m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARNregdep(loc,m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ +#define ckWARNregdep(loc,m) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ + WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARN2reg_d(loc,m, a1) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(offset)); \ +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARN2reg(loc, m, a1) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(offset)); \ +#define ckWARN2reg(loc, m, a1) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define vWARN3(loc, m, a1, a2) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, REPORT_LOCATION_ARGS(offset)); \ +#define vWARN3(loc, m, a1, a2) STMT_START { \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, REPORT_LOCATION_ARGS(offset)); \ +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, a4, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END /* Macros for recording node offsets. 20001227 mjd@plover.com @@ -938,6 +1001,151 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); +/* ========================================================= + * BEGIN edit_distance stuff. + * + * This calculates how many single character changes of any type are needed to + * transform a string into another one. It is taken from version 3.1 of + * + * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS + */ + +/* Our unsorted dictionary linked list. */ +/* Note we use UVs, not chars. */ + +struct dictionary{ + UV key; + UV value; + struct dictionary* next; +}; +typedef struct dictionary item; + + +PERL_STATIC_INLINE item* +push(UV key,item* curr) +{ + item* head; + Newxz(head, 1, item); + head->key = key; + head->value = 0; + head->next = curr; + return head; +} + + +PERL_STATIC_INLINE item* +find(item* head, UV key) +{ + item* iterator = head; + while (iterator){ + if (iterator->key == key){ + return iterator; + } + iterator = iterator->next; + } + + return NULL; +} + +PERL_STATIC_INLINE item* +uniquePush(item* head,UV key) +{ + item* iterator = head; + + while (iterator){ + if (iterator->key == key) { + return head; + } + iterator = iterator->next; + } + + return push(key,head); +} + +PERL_STATIC_INLINE void +dict_free(item* head) +{ + item* iterator = head; + + while (iterator) { + item* temp = iterator; + iterator = iterator->next; + Safefree(temp); + } + + head = NULL; +} + +/* End of Dictionary Stuff */ + +/* All calculations/work are done here */ +STATIC int +S_edit_distance(const UV* src, + const UV* tgt, + const STRLEN x, /* length of src[] */ + const STRLEN y, /* length of tgt[] */ + const SSize_t maxDistance +) +{ + item *head = NULL; + UV swapCount,swapScore,targetCharCount,i,j; + UV *scores; + UV score_ceil = x + y; + + PERL_ARGS_ASSERT_EDIT_DISTANCE; + + /* intialize matrix start values */ + Newxz(scores, ( (x + 2) * (y + 2)), UV); + scores[0] = score_ceil; + scores[1 * (y + 2) + 0] = score_ceil; + scores[0 * (y + 2) + 1] = score_ceil; + scores[1 * (y + 2) + 1] = 0; + head = uniquePush(uniquePush(head,src[0]),tgt[0]); + + /* work loops */ + /* i = src index */ + /* j = tgt index */ + for (i=1;i<=x;i++) { + if (i < x) + head = uniquePush(head,src[i]); + scores[(i+1) * (y + 2) + 1] = i; + scores[(i+1) * (y + 2) + 0] = score_ceil; + swapCount = 0; + + for (j=1;j<=y;j++) { + if (i == 1) { + if(j < y) + head = uniquePush(head,tgt[j]); + scores[1 * (y + 2) + (j + 1)] = j; + scores[0 * (y + 2) + (j + 1)] = score_ceil; + } + + targetCharCount = find(head,tgt[j-1])->value; + swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; + + if (src[i-1] != tgt[j-1]){ + scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); + } + else { + swapCount = j; + scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); + } + } + + find(head,src[i-1])->value = i; + } + + { + IV score = scores[(x+1) * (y + 2) + (y + 1)]; + dict_free(head); + Safefree(scores); + return (maxDistance != 0 && maxDistance < score)?(-1):score; + } +} + +/* END of edit_distance() stuff + * ========================================================= */ + /* is c a control character for which we have a mnemonic? */ #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) @@ -1152,7 +1360,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * returned list must, and will, contain every code point that is a * possibility. */ - SV* invlist = sv_2mortal(_new_invlist(0)); + SV* invlist = NULL; SV* only_utf8_locale_invlist = NULL; unsigned int i; const U32 n = ARG(node); @@ -1174,6 +1382,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* Here, no compile-time swash, and there are things that won't be * known until runtime -- we have to assume it could be anything */ + invlist = sv_2mortal(_new_invlist(1)); return _add_range_to_invlist(invlist, 0, UV_MAX); } else if (ary[3] && ary[3] != &PL_sv_undef) { @@ -1184,13 +1393,17 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } /* Get the code points valid only under UTF-8 locales */ - if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + if ((ANYOF_FLAGS(node) & ANYOFL_FOLD) && ary[2] && ary[2] != &PL_sv_undef) { only_utf8_locale_invlist = ary[2]; } } + if (! invlist) { + invlist = sv_2mortal(_new_invlist(0)); + } + /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS * code points, and an inversion list for the others, but if there are code * points that should match only conditionally on the target string being @@ -1210,14 +1423,21 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* Add in the points from the bit map */ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { if (ANYOF_BITMAP_TEST(node, i)) { - invlist = add_cp_to_invlist(invlist, i); + unsigned int start = i++; + + for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); new_node_has_latin1 = TRUE; } } /* If this can match all upper Latin1 code points, have to add them - * as well */ - if (OP(node) == ANYOFD + * as well. But don't add them if inverting, as when that gets done below, + * it would exclude all these characters, including the ones it shouldn't + * that were added just above */ + if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) { _invlist_union(invlist, PL_UpperLatin1, &invlist); @@ -1231,7 +1451,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, if (ANYOF_FLAGS(node) & ANYOF_INVERT) { _invlist_invert(invlist); } - else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) { /* Under /li, any 0-255 could fold to any other 0-255, depending on the * locale. We can skip this if there are no 0-255 at all. */ @@ -1308,7 +1528,12 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { anded_flags = ANYOF_FLAGS(and_with) &( ANYOF_COMMON_FLAGS - |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER); + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER + |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); + if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) { + anded_flags &= + ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } } } @@ -1463,7 +1688,12 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, if (OP(or_with) != ANYOFD) { ored_flags |= ANYOF_FLAGS(or_with) - & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER + |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); + if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) { + ored_flags |= + ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } } } @@ -1608,35 +1838,28 @@ S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) * (unassigned, private use, surrogates, controls and formats). This * is a much large number. */ - const U32 max_match = (LOC) - ? 127 - : (! UNI_SEMANTICS) - ? 63 - : (invlist_highest(ssc->invlist) < 256) - ? 127 - : ((NON_OTHER_COUNT + 1) / 2) - 1; U32 count = 0; /* Running total of number of code points matched by 'ssc' */ UV start, end; /* Start and end points of current range in inversion list */ + const U32 max_code_points = (LOC) + ? 256 + : (( ! UNI_SEMANTICS + || invlist_highest(ssc->invlist) < 256) + ? 128 + : NON_OTHER_COUNT); + const U32 max_match = max_code_points / 2; PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; invlist_iterinit(ssc->invlist); while (invlist_iternext(ssc->invlist, &start, &end)) { - - /* /u is the only thing that we expect to match above 255; so if not /u - * and even if there are matches above 255, ignore them. This catches - * things like \d under /d which does match the digits above 255, but - * since the pattern is /d, it is not likely to be expecting them */ - if (! UNI_SEMANTICS) { - if (start > 255) { - break; - } - end = MIN(end, 255); + if (start >= max_code_points) { + break; } + end = MIN(end, max_code_points - 1); count += end - start + 1; - if (count > max_match) { + if (count >= max_match) { invlist_iterfinish(ssc->invlist); return FALSE; } @@ -1665,7 +1888,8 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) * by the time we reach here */ assert(! (ANYOF_FLAGS(ssc) & ~( ANYOF_COMMON_FLAGS - |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))); + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER + |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP))); populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); @@ -3364,6 +3588,14 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * + * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full + * as possible, even if that means splitting an existing node so that its first + * part is moved to the preceeding node. This would maximise the efficiency of + * memEQ during matching. Elsewhere in this file, khw proposes splitting + * EXACTFish nodes into portions that don't change under folding vs those that + * do. Those portions that don't change may be the only things in the pattern that + * could be used to find fixed and floating strings. + * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character * fold. *min_subtract is set to the total delta number of characters of the @@ -4802,7 +5034,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Quantifier unexpected on zero-length expression " "in regex m/%"UTF8f"/", - UTF8fARG(UTF, RExC_end - RExC_precomp, + UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, RExC_precomp)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -6482,6 +6714,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Initialize these here instead of as-needed, as is quick and avoids * having to test them each time otherwise */ if (! PL_AboveLatin1) { +#ifdef DEBUGGING + char * dump_len_string; +#endif + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); @@ -6495,6 +6731,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_InBitmap = _new_invlist(2); PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, NUM_ANYOF_CODE_POINTS - 1); +#ifdef DEBUGGING + dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); + if ( ! dump_len_string + || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) + { + PL_dump_re_max_len = 0; + } +#endif } pRExC_state->code_blocks = NULL; @@ -6675,11 +6919,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } RExC_precomp = exp; + RExC_precomp_adj = 0; RExC_flags = rx_flags; RExC_pm_flags = pm_flags; if (runtime_code) { - if (TAINTING_get && TAINT_get) + assert(TAINTING_get || !TAINT_get); + if (TAINT_get) Perl_croak(aTHX_ "Eval-group in insecure regular expression"); if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { @@ -6707,8 +6953,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* First pass: determine size, legality. */ RExC_parse = exp; - RExC_start = exp; + RExC_start = RExC_adjusted_start = exp; RExC_end = exp + plen; + RExC_precomp_end = RExC_end; RExC_naughty = 0; RExC_npar = 1; RExC_nestroot = 0; @@ -6728,6 +6975,15 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_recurse_count = 0; pRExC_state->code_index = 0; + /* This NUL is guaranteed because the pattern comes from an SV*, and the sv + * code makes sure the final byte is an uncounted NUL. But should this + * ever not be the case, lots of things could read beyond the end of the + * buffer: loops like + * while(isFOO(*RExC_parse)) RExC_parse++; + * strchr(RExC_parse, "foo"); + * etc. So it is worth noting. */ + assert(*RExC_end == '\0'); + DEBUG_PARSE_r( PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); RExC_lastnum=0; @@ -7929,7 +8185,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) assert (RExC_parse <= RExC_end); if (RExC_parse == RExC_end) NOOP; else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { - /* skip IDFIRST by using do...while */ + /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by + * using do...while */ if (UTF) do { RExC_parse += UTF8SKIP(RExC_parse); @@ -8104,6 +8361,52 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) #ifndef PERL_IN_XSUB_RE +STATIC void +S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) +{ + /* Replaces the inversion list in 'src' with the one in 'dest'. It steals + * the list from 'src', so 'src' is made to have a NULL list. This is + * similar to what SvSetMagicSV() would do, if it were implemented on + * inversion lists, though this routine avoids a copy */ + + const UV src_len = _invlist_len(src); + const bool src_offset = *get_invlist_offset_addr(src); + const STRLEN src_byte_len = SvLEN(src); + char * array = SvPVX(src); + + const int oldtainted = TAINT_get; + + PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; + + assert(SvTYPE(src) == SVt_INVLIST); + assert(SvTYPE(dest) == SVt_INVLIST); + assert(! invlist_is_iterating(src)); + assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); + + /* Make sure it ends in the right place with a NUL, as our inversion list + * manipulations aren't careful to keep this true, but sv_usepvn_flags() + * asserts it */ + array[src_byte_len - 1] = '\0'; + + TAINT_NOT; /* Otherwise it breaks */ + sv_usepvn_flags(dest, + (char *) array, + src_byte_len - 1, + + /* This flag is documented to cause a copy to be avoided */ + SV_HAS_TRAILING_NUL); + TAINT_set(oldtainted); + SvPV_set(src, 0); + SvLEN_set(src, 0); + SvCUR_set(src, 0); + + /* Finish up copying over the other fields in an inversion list */ + *get_invlist_offset_addr(dest) = src_offset; + invlist_set_len(dest, src_len, src_offset); + *get_invlist_previous_index_addr(dest) = 0; + invlist_iterfinish(dest); +} + PERL_STATIC_INLINE IV* S_get_invlist_previous_index_addr(SV* invlist) { @@ -8139,17 +8442,34 @@ S_invlist_set_previous_index(SV* const invlist, const IV index) } PERL_STATIC_INLINE void -S_invlist_trim(SV* const invlist) +S_invlist_trim(SV* invlist) { + /* Free the not currently-being-used space in an inversion list */ + + /* But don't free up the space needed for the 0 UV that is always at the + * beginning of the list, nor the trailing NUL */ + const UV min_size = TO_INTERNAL_SIZE(1) + 1; + PERL_ARGS_ASSERT_INVLIST_TRIM; assert(SvTYPE(invlist) == SVt_INVLIST); - /* Change the length of the inversion list to how many entries it currently - * has */ - SvPV_shrink_to_cur((SV *) invlist); + SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); } +PERL_STATIC_INLINE void +S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ +{ + PERL_ARGS_ASSERT_INVLIST_CLEAR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + invlist_set_len(invlist, 0, 0); + invlist_trim(invlist); +} + +#endif /* ifndef PERL_IN_XSUB_RE */ + PERL_STATIC_INLINE bool S_invlist_is_iterating(SV* const invlist) { @@ -8158,8 +8478,6 @@ S_invlist_is_iterating(SV* const invlist) return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } -#endif /* ifndef PERL_IN_XSUB_RE */ - PERL_STATIC_INLINE UV S_invlist_max(SV* const invlist) { @@ -8372,7 +8690,9 @@ Perl__invlist_search(SV* const invlist, const UV cp) /* Searches the inversion list for the entry that contains the input code * point . If is not in the list, -1 is returned. Otherwise, the * return value is the index into the list's array of the range that - * contains */ + * contains , that is, 'i' such that + * array[i] <= cp < array[i+1] + */ IV low = 0; IV mid; @@ -8391,7 +8711,10 @@ Perl__invlist_search(SV* const invlist, const UV cp) array = invlist_array(invlist); mid = invlist_previous_index(invlist); - assert(mid >=0 && mid <= highest_element); + assert(mid >=0); + if (mid > highest_element) { + mid = highest_element; + } /* contains the cache of the result of the previous call to this * function (0 the first time). See if this call is for the same result, @@ -8555,10 +8878,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise *output will be made correspondingly - * mortal. The first list, , may be NULL, in which case a copy of the - * second list is returned. If is TRUE, the union is taken - * of the complement (inversion) of instead of b itself. + * temporary (mortal); otherwise just its contents will be modified to be + * the union. The first list, , may be NULL, in which case a copy of + * the second list is returned. If is TRUE, the union is + * taken of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -8597,56 +8920,103 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); - /* If either one is empty, the union is the other one */ - if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { - bool make_temp = FALSE; /* Should we mortalize the result? */ + len_b = _invlist_len(b); + if (len_b == 0) { - if (*output == a) { - if (a != NULL) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } + /* Here, 'b' is empty. If the output is the complement of 'b', the + * union is all possible code points, and we need not even look at 'a'. + * It's easiest to create a new inversion list that matches everything. + * */ + if (complement_b) { + SV* everything = _new_invlist(1); + _append_range_to_invlist(everything, 0, UV_MAX); + + /* If the output didn't exist, just point it at the new list */ + if (*output == NULL) { + *output = everything; + return; } - } - if (*output != b) { - *output = invlist_clone(b); - if (complement_b) { - _invlist_invert(*output); + + /* Otherwise, replace its contents with the new list */ + invlist_replace_list_destroys_src(*output, everything); + SvREFCNT_dec_NN(everything); + return; + } + + /* Here, we don't want the complement of 'b', and since it is empty, + * the union will come entirely from 'a'. If 'a' is NULL or empty, the + * output will be empty */ + + if (a == NULL) { + *output = _new_invlist(0); + return; + } + + if (_invlist_len(a) == 0) { + invlist_clear(*output); + return; + } + + /* Here, 'a' is not empty, and entirely determines the union. If the + * output is not to overwrite 'b', we can just return 'a'. */ + if (*output != b) { + + /* If the output is to overwrite 'a', we have a no-op, as it's + * already in 'a' */ + if (*output == a) { + return; } - } /* else *output already = b; */ - if (make_temp) { - sv_2mortal(*output); + /* But otherwise we have to copy 'a' to the output */ + *output = invlist_clone(a); + return; } + + /* Here, 'b' is to be overwritten by the output, which will be 'a' */ + u = invlist_clone(a); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + return; } - else if ((len_b = _invlist_len(b)) == 0) { - bool make_temp = FALSE; - if (*output == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); + + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + + /* Here, 'a' is empty (and b is not). That means the union will come + * entirely from 'b'. If the output is not to overwrite 'a', we can + * just return what's in 'b'. */ + if (*output != a) { + + /* If the output is to overwrite 'b', it's already in 'b', but + * otherwise we have to copy 'b' to the output */ + if (*output != b) { + *output = invlist_clone(b); } - } - /* The complement of an empty list is a list that has everything in it, - * so the union with includes everything too */ - if (complement_b) { - if (a == *output) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } + /* And if the output is to be the inversion of 'b', do that */ + if (complement_b) { + _invlist_invert(*output); } - *output = _new_invlist(1); - _append_range_to_invlist(*output, 0, UV_MAX); + + return; } - else if (*output != a) { - *output = invlist_clone(a); + + /* Here, 'a', which is empty or even NULL, is to be overwritten by the + * output, which will either be 'b' or the complement of 'b' */ + + if (a == NULL) { + *output = invlist_clone(b); } - /* else *output already = a; */ + else { + u = invlist_clone(b); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + } - if (make_temp) { - sv_2mortal(*output); + if (complement_b) { + _invlist_invert(*output); } + return; } @@ -8712,7 +9082,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Here, have chosen which of the two inputs to look at. Only output * if the running count changes to/from 0, which marks the - * beginning/end of a range in that's in the set */ + * beginning/end of a range that's in the set */ if (cp_in_set) { if (count == 0) { array_u[i_u++] = cp; @@ -8746,7 +9116,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; * decrementing to 0 insures that we look at the remainder of the * non-exhausted set */ - if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count--; @@ -8762,8 +9132,9 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, len_u += (len_a - i_a) + (len_b - i_b); } - /* Set result to final length, which can change the pointer to array_u, so - * re-find it */ + /* Set the result to the final length, which can change the pointer to + * array_u, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ if (len_u != _invlist_len(u)) { invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); @@ -8773,7 +9144,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* When 'count' is 0, the list that was exhausted (if one was shorter than * the other) ended with everything above it not in its set. That means * that the remaining part of the union is precisely the same as the - * non-exhausted list, so can just copy it unchanged. (If both list were + * non-exhausted list, so can just copy it unchanged. (If both lists were * exhausted at the same time, then the operations below will be both 0.) */ if (count == 0) { @@ -8786,21 +9157,32 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs. If so, the output - * is made mortal if the input was. (Mortal SVs shouldn't have their ref - * count decremented) */ - if (a == *output || b == *output) { + /* If the output is not to overwrite either of the inputs, just return the + * calculated union */ + if (a != *output && b != *output) { + *output = u; + } + else { + /* Here, the output is to be the same as one of the input scalars, + * hence replacing it. The simple thing to do is to free the input + * scalar, making it instead be the output one. But experience has + * shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. So for that case, replace just the input's interior with + * the output's, and then free the output */ + assert(! invlist_is_iterating(*output)); - if ((SvTEMP(*output))) { - sv_2mortal(u); + + if (! SvTEMP(*output)) { + SvREFCNT_dec_NN(*output); + *output = u; } else { - SvREFCNT_dec_NN(*output); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); } } - *output = u; - return; } @@ -8811,11 +9193,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise *i will be made correspondingly mortal. - * The first list, , may be NULL, in which case an empty list is - * returned. If is TRUE, the result will be the - * intersection of and the complement (or inversion) of instead of - * directly. + * temporary (mortal); otherwise just its contents will be modified to be + * the intersection. The first list, , may be NULL, in which case an + * empty list is returned. If is TRUE, the result will be + * the intersection of and the complement (or inversion) of instead + * of directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -8853,48 +9235,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Special case if either one is empty */ len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { - bool make_temp = FALSE; - if (len_a != 0 && complement_b) { - /* Here, 'a' is not empty, therefore from the above 'if', 'b' must - * be empty. Here, also we are using 'b's complement, which hence - * must be every possible code point. Thus the intersection is - * simply 'a'. */ - if (*i != a) { - if (*i == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); - } - } + /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' + * must be empty. Here, also we are using 'b's complement, which + * hence must be every possible code point. Thus the intersection + * is simply 'a'. */ - *i = invlist_clone(a); + if (*i == a) { /* No-op */ + return; } - /* else *i is already 'a' */ - if (make_temp) { - sv_2mortal(*i); + /* If not overwriting either input, just make a copy of 'a' */ + if (*i != b) { + *i = invlist_clone(a); + return; } + + /* Here we are overwriting 'b' with 'a's contents */ + r = invlist_clone(a); + invlist_replace_list_destroys_src(*i, r); + SvREFCNT_dec_NN(r); return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ - if (*i == a) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } - } - else if (*i == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); - } - } - *i = _new_invlist(0); - if (make_temp) { - sv_2mortal(*i); + if (*i == NULL) { + *i = _new_invlist(0); + return; } + invlist_clear(*i); return; } @@ -8992,7 +9364,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * everything that remains in the non-exhausted set. * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and * remains 1. And the intersection has nothing more. */ - if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count++; @@ -9006,8 +9378,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, len_r += (len_a - i_a) + (len_b - i_b); } - /* Set result to final length, which can change the pointer to array_r, so - * re-find it */ + /* Set the result to the final length, which can change the pointer to + * array_r, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ if (len_r != _invlist_len(r)) { invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); @@ -9025,21 +9398,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs. If so, the output - * is made mortal if the input was. (Mortal SVs shouldn't have their ref - * count decremented) */ - if (a == *i || b == *i) { + /* If the output is not to overwrite either of the inputs, just return the + * calculated intersection */ + if (a != *i && b != *i) { + *i = r; + } + else { + /* Here, the output is to be the same as one of the input scalars, + * hence replacing it. The simple thing to do is to free the input + * scalar, making it instead be the output one. But experience has + * shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. So for that case, replace just the input's interior with + * the output's, and then free the output. A short-cut in this case + * is if the output is empty, we can just set the input to be empty */ + assert(! invlist_is_iterating(*i)); - if (SvTEMP(*i)) { - sv_2mortal(r); + + if (! SvTEMP(*i)) { + SvREFCNT_dec_NN(*i); + *i = r; } else { - SvREFCNT_dec_NN(*i); + if (len_r) { + invlist_replace_list_destroys_src(*i, r); + } + else { + invlist_clear(*i); + } + SvREFCNT_dec_NN(r); } } - *i = r; - return; } @@ -9275,38 +9665,56 @@ S_invlist_highest(SV* const invlist) : array[len - 1] - 1; } -#ifndef PERL_IN_XSUB_RE -SV * -Perl__invlist_contents(pTHX_ SV* const invlist) +STATIC SV * +S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) { /* Get the contents of an inversion list into a string SV so that they can - * be printed out. It uses the format traditionally done for debug tracing - */ + * be printed out. If 'traditional_style' is TRUE, it uses the format + * traditionally done for debug tracing; otherwise it uses a format + * suitable for just copying to the output, with blanks between ranges and + * a dash between range components */ UV start, end; - SV* output = newSVpvs("\n"); + SV* output; + const char intra_range_delimiter = (traditional_style ? '\t' : '-'); + const char inter_range_delimiter = (traditional_style ? '\n' : ' '); + + if (traditional_style) { + output = newSVpvs("\n"); + } + else { + output = newSVpvs(""); + } - PERL_ARGS_ASSERT__INVLIST_CONTENTS; + PERL_ARGS_ASSERT_INVLIST_CONTENTS; assert(! invlist_is_iterating(invlist)); invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c", + start, intra_range_delimiter, + inter_range_delimiter); } else if (end != start) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", - start, end); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c", + start, + intra_range_delimiter, + end, inter_range_delimiter); } else { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c", + start, inter_range_delimiter); } } + if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ + SvCUR_set(output, SvCUR(output) - 1); + } + return output; } -#endif #ifndef PERL_IN_XSUB_RE void @@ -9377,9 +9785,9 @@ Perl__load_PL_utf8_foldclosures (pTHX) } #endif -#ifdef PERL_ARGS_ASSERT__INVLISTEQ +#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) bool -S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { /* Return a boolean as to if the two passed in inversion lists are * identical. The final argument, if TRUE, says to take the complement of @@ -9636,7 +10044,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) cs = REGEX_UNICODE_CHARSET; } - while (*RExC_parse) { + while (RExC_parse < RExC_end) { /* && strchr("iogcmsx", *RExC_parse) */ /* (?g), (?gc) and (?o) are useless here and must be globally applied -- japhy */ @@ -9794,7 +10202,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) NOT_REACHED; /*NOTREACHED*/ } - ++RExC_parse; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; } vFAIL("Sequence (?... not terminated"); @@ -9816,6 +10224,54 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) #endif +PERL_STATIC_INLINE regnode * +S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, + I32 *flagp, + char * parse_start, + char ch + ) +{ + regnode *ret; + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; + + if (RExC_parse == name_start || *RExC_parse != ch) { + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + } + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + + nextchar(pRExC_state); + return ret; +} + /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be @@ -9851,6 +10307,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = 0; /* Tentatively. */ + /* Having this true makes it feasible to have a lot fewer tests for the + * parse pointer being in scope. For example, we can write + * while(isFOO(*RExC_parse)) RExC_parse++; + * instead of + * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++; + */ + assert(*RExC_end == '\0'); /* Make an OPEN node, if parenthesized. */ if (paren) { @@ -9861,40 +10324,49 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * indivisible */ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; + if (RExC_parse >= RExC_end) { + vFAIL("Unmatched ("); + } + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ - char *start_verb = RExC_parse; - STRLEN verb_len = 0; + char *start_verb = RExC_parse + 1; + STRLEN verb_len; char *start_arg = NULL; unsigned char op = 0; int arg_required = 0; int internal_argval = -1; /* if >-1 we are not allowed an argument*/ if (has_intervening_patws) { - RExC_parse++; + RExC_parse++; /* past the '*' */ vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); } - while ( *RExC_parse && *RExC_parse != ')' ) { + while (RExC_parse < RExC_end && *RExC_parse != ')' ) { if ( *RExC_parse == ':' ) { start_arg = RExC_parse + 1; break; } - RExC_parse++; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; } - ++start_verb; verb_len = RExC_parse - start_verb; if ( start_arg ) { - RExC_parse++; - while ( *RExC_parse && *RExC_parse != ')' ) - RExC_parse++; - if ( *RExC_parse != ')' ) + if (RExC_parse >= RExC_end) { + goto unterminated_verb_pattern; + } + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + while ( RExC_parse < RExC_end && *RExC_parse != ')' ) + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) + unterminated_verb_pattern: vFAIL("Unterminated verb pattern argument"); if ( RExC_parse == start_arg ) start_arg = NULL; } else { - if ( *RExC_parse != ')' ) + if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) vFAIL("Unterminated verb pattern"); } + /* Here, we know that RExC_parse < RExC_end */ + switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ if ( memEQs(start_verb,verb_len,"ACCEPT") ) { @@ -9976,56 +10448,37 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("In '(?...)', the '(' and '?' must be adjacent"); } - RExC_parse++; - paren = *RExC_parse++; + RExC_parse++; /* past the '?' */ + paren = *RExC_parse; /* might be a trailing NUL, if not + well-formed */ + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + if (RExC_parse > RExC_end) { + paren = '\0'; + } ret = NULL; /* For look-ahead/behind. */ switch (paren) { case 'P': /* (?P...) variants for those used to PCRE/Python */ - paren = *RExC_parse++; - if ( paren == '<') /* (?P<...>) named capture */ + paren = *RExC_parse; + if ( paren == '<') { /* (?P<...>) named capture */ + RExC_parse++; + if (RExC_parse >= RExC_end) { + vFAIL("Sequence (?P<... not terminated"); + } goto named_capture; + } else if (paren == '>') { /* (?P>name) named recursion */ + RExC_parse++; + if (RExC_parse >= RExC_end) { + vFAIL("Sequence (?P>... not terminated"); + } goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in - * regatom(), if you change this make sure you change that - * */ - char* name_start = RExC_parse; - U32 num = 0; - SV *sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - if (RExC_parse == name_start || *RExC_parse != ')') - /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.3s... not terminated",parse_start); - - if (!SIZE_ONLY) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); - RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc_simple_void(sv_dat); - } - RExC_sawback = 1; - ret = reganode(pRExC_state, - ((! FOLD) - ? NREF - : (ASCII_FOLD_RESTRICTED) - ? NREFFA - : (AT_LEAST_UNI_SEMANTICS) - ? NREFFU - : (LOC) - ? NREFFL - : NREFF), - num); - *flagp |= HASWIDTH; - - Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret, parse_start); - - nextchar(pRExC_state); - return ret; + RExC_parse++; + return handle_named_backref(pRExC_state, flagp, + parse_start, ')'); } - --RExC_parse; RExC_parse += SKIP_IF_CHAR(RExC_parse); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL3("Sequence (%.*s...) not recognized", @@ -10040,15 +10493,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *name_start; SV *svname; paren= '>'; + /* FALLTHROUGH */ case '\'': /* (?'...') */ - name_start= RExC_parse; - svname = reg_scan_name(pRExC_state, + name_start = RExC_parse; + svname = reg_scan_name(pRExC_state, SIZE_ONLY /* reverse test from the others */ ? REG_RSN_RETURN_NAME : REG_RSN_RETURN_NULL); - if (RExC_parse == name_start || *RExC_parse != paren) + if ( RExC_parse == name_start + || RExC_parse >= RExC_end + || *RExC_parse != paren) + { vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); + } if (SIZE_ONLY) { HE *he_str; SV *sv_dat = NULL; @@ -10116,6 +10574,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; + assert(RExC_parse < RExC_end); /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; @@ -10163,7 +10622,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } - if (RExC_parse == RExC_end || *RExC_parse != ')') + if (RExC_parse >= RExC_end || *RExC_parse != ')') vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; /* NOTREACHED */ @@ -10182,7 +10641,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': - RExC_parse--; + RExC_parse = (char *) seqstart + 1; /* Point to the digit */ parse_recursion: { bool is_neg = FALSE; @@ -10262,7 +10721,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) NOT_REACHED; /*NOTREACHED*/ } *flagp |= POSTPONED; - paren = *RExC_parse++; + paren = '{'; + RExC_parse++; /* FALLTHROUGH */ case '{': /* (?{...}) */ { @@ -10329,11 +10789,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) int is_define= 0; const int DEFINE_len = sizeof("DEFINE") - 1; if (RExC_parse[0] == '?') { /* (?(?...)) */ - if ( - RExC_parse[1] == '=' || - RExC_parse[1] == '!' || - RExC_parse[1] == '<' || - RExC_parse[1] == '{' + if ( RExC_parse < RExC_end - 1 + && ( RExC_parse[1] == '=' + || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') ) { /* Lookahead or eval. */ I32 flag; regnode *tail; @@ -10361,9 +10821,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) U32 num = 0; SV *sv_dat=reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - if (RExC_parse == name_start || *RExC_parse != ch) + if ( RExC_parse == name_start + || RExC_parse >= RExC_end + || *RExC_parse != ch) + { vFAIL2("Sequence (?(%c... not terminated", (ch == '>' ? '<' : ch)); + } RExC_parse++; if (!SIZE_ONLY) { num = add_data( pRExC_state, STR_WITH_LEN("S")); @@ -10467,7 +10931,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else lastbr = NULL; if (c != ')') { - if (RExC_parse>RExC_end) + if (RExC_parse >= RExC_end) vFAIL("Switch (?(condition)... not terminated"); else vFAIL("Switch (?(condition)... contains too many branches"); @@ -10491,16 +10955,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '[': /* (?[ ... ]) */ return handle_regex_sets(pRExC_state, NULL, flagp, depth, oregcomp_parse); - case 0: + case 0: /* A NUL */ RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); break; default: /* e.g., (?i) */ - --RExC_parse; + RExC_parse = (char *) seqstart + 1; parse_flags: parse_lparen_question_flags(pRExC_state); if (UCHARAT(RExC_parse) != ':') { - if (*RExC_parse) + if (RExC_parse < RExC_end) nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; @@ -10936,7 +11400,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* We can't back off the size because we have to reserve * enough space for all the things we are about to throw - * away, but we can shrink it by the ammount we are about + * away, but we can shrink it by the amount we are about * to re-use here */ RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; } @@ -10947,15 +11411,13 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reganode(pRExC_state, OPFAIL, 0); return ret; } - else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?') + else if (min == max && *RExC_parse == '?') { if (PASS2) { ckWARN2reg(RExC_parse + 1, "Useless use of greediness modifier '%c'", *RExC_parse); } - /* Absorb the modifier, so later code doesn't see nor use it */ - nextchar(pRExC_state); } do_curly: @@ -11071,13 +11533,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) (void)ReREFCNT_inc(RExC_rx_sv); } - if (RExC_parse < RExC_end && *RExC_parse == '?') { + if (*RExC_parse == '?') { nextchar(pRExC_state); reginsert(pRExC_state, MINMOD, ret, depth+1); REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); } - else - if (RExC_parse < RExC_end && *RExC_parse == '+') { + else if (*RExC_parse == '+') { regnode *ender; nextchar(pRExC_state); ender = reg_node(pRExC_state, SUCCEED); @@ -11088,7 +11549,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) REGTAIL(pRExC_state, ret, ender); } - if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + if (ISMULT2(RExC_parse)) { RExC_parse++; vFAIL("Nested quantifiers"); } @@ -11102,6 +11563,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, UV * code_point_p, int * cp_count, I32 * flagp, + const bool strict, const U32 depth ) { @@ -11251,6 +11713,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, semantics */ if (endbrace == RExC_parse) { /* empty: \N{} */ + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } if (cp_count) { *cp_count = 0; } @@ -11265,6 +11731,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_parse += 2; /* Skip past the 'U+' */ + /* Because toke.c has generated a special construct for us guaranteed not + * to have NULs, we can use a str function */ endchar = RExC_parse + strcspn(RExC_parse, ".}"); /* Code points are separated by dots. If none, there is only one code @@ -11324,6 +11792,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, SV * substitute_parse; STRLEN len; char *orig_end = RExC_end; + char *save_start = RExC_start; I32 flags; /* Count the code points, if desired, in the sequence */ @@ -11369,7 +11838,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, } sv_catpv(substitute_parse, ")"); - RExC_parse = SvPV(substitute_parse, len); + RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse, + len); /* Don't allow empty number */ if (len < (STRLEN) 8) { @@ -11399,6 +11869,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, } /* Restore the saved values */ + RExC_start = RExC_adjusted_start = save_start; RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; @@ -11425,10 +11896,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * it returns U+FFFD (Replacement character) and sets *encp to NULL. */ STATIC UV -S_reg_recode(pTHX_ const char value, SV **encp) +S_reg_recode(pTHX_ const U8 value, SV **encp) { STRLEN numlen = 1; - SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP); const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -11743,6 +12214,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) tryagain: parse_start = RExC_parse; + assert(RExC_parse < RExC_end); switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; @@ -11782,6 +12254,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FALSE, /* don't silence non-portable warnings. */ (bool) RExC_strict, TRUE, /* Allow an optimized regnode result */ + NULL, NULL); if (ret == NULL) { if (*flagp & (RESTART_PASS1|NEED_UTF8)) @@ -11802,7 +12275,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg(pRExC_state, 2, &flags,depth+1); if (ret == NULL) { if (flags & TRYAGAIN) { - if (RExC_parse == RExC_end) { + if (RExC_parse >= RExC_end) { /* Make parent create an empty node if needed. */ *flagp |= TRYAGAIN; return(NULL); @@ -11846,7 +12319,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) required, as the default for this switch is to jump to the literal text handling code. */ - switch ((U8)*++RExC_parse) { + RExC_parse++; + switch ((U8)*RExC_parse) { /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -11914,7 +12388,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, op); *flagp |= SIMPLE; - if (*(RExC_parse + 1) != '{') { + if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { FLAGS(ret) = TRADITIONAL_BOUND; if (PASS2 && op > BOUNDA) { /* /aa is same as /a */ OP(ret) = BOUNDA; @@ -11953,6 +12427,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } FLAGS(ret) = GCB_BOUND; break; + case 'l': + if (length != 2 || *(RExC_parse + 1) != 'b') { + goto bad_bound_type; + } + FLAGS(ret) = LB_BOUND; + break; case 's': if (length != 2 || *(RExC_parse + 1) != 'b') { goto bad_bound_type; @@ -12080,6 +12560,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) non-portables */ (bool) RExC_strict, TRUE, /* Allow an optimized regnode result */ + NULL, NULL); if (*flagp & RESTART_PASS1) return NULL; @@ -12115,6 +12596,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NULL, /* Don't need a count of how many code points */ flagp, + RExC_strict, depth) ) { break; @@ -12130,48 +12612,25 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: { - char ch= RExC_parse[1]; - if (ch != '<' && ch != '\'' && ch != '{') { + char ch; + if ( RExC_parse >= RExC_end - 1 + || (( ch = RExC_parse[1]) != '<' + && ch != '\'' + && ch != '{')) + { RExC_parse++; /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { - /* this pretty much dupes the code for (?P=...) in reg(), if - you change this make sure you change that */ - char* name_start = (RExC_parse += 2); - U32 num = 0; - SV *sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; - if (RExC_parse == name_start || *RExC_parse != ch) - /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.3s... not terminated",parse_start); - - if (!SIZE_ONLY) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); - RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc_simple_void(sv_dat); - } - - RExC_sawback = 1; - ret = reganode(pRExC_state, - ((! FOLD) - ? NREF - : (ASCII_FOLD_RESTRICTED) - ? NREFFA - : (AT_LEAST_UNI_SEMANTICS) - ? NREFFU - : (LOC) - ? NREFFL - : NREFF), - num); - *flagp |= HASWIDTH; - - /* override incorrect value set in reganode MJD */ - Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); - + RExC_parse += 2; + ret = handle_named_backref(pRExC_state, + flagp, + parse_start, + (ch == '<') + ? '>' + : (ch == '{') + ? '}' + : '\''); } break; } @@ -12200,6 +12659,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto parse_named_seq; } + if (RExC_parse >= RExC_end) { + goto unterminated_g; + } num = S_backref_value(RExC_parse); if (num == 0) vFAIL("Reference to invalid group 0"); @@ -12207,6 +12669,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (isDIGIT(*RExC_parse)) vFAIL("Reference to nonexistent group"); else + unterminated_g: vFAIL("Unterminated \\g... pattern"); } @@ -12358,7 +12821,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reparse: /* We look for the EXACTFish to EXACT node optimizaton only if - * folding. (And we don't need to figure this out until pass 2) */ + * folding. (And we don't need to figure this out until pass 2). + * XXX It might actually make sense to split the node into portions + * that are exact and ones that aren't, so that we could later use + * the exact ones to find the longest fixed and floating strings. + * One would want to join them back into a larger node. One could + * use a pseudo regnode like 'EXACT_ORIG_FOLD' */ maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to @@ -12455,6 +12923,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NULL, /* Don't need a count of how many code points */ flagp, + RExC_strict, depth) ) { if (*flagp & NEED_UTF8) @@ -12605,7 +13074,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) p += numlen; if (PASS2 /* like \08, \178 */ && numlen < 3 - && p < RExC_end && isDIGIT(*p) && ckWARN(WARN_REGEXP)) { reg_warn_non_literal_string( @@ -12619,7 +13087,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) recode_encoding: if (! RExC_override_recoding) { SV* enc = _get_encoding(); - ender = reg_recode((const char)(U8)ender, &enc); + ender = reg_recode((U8)ender, &enc); if (!enc && PASS2) ckWARNreg(p, "Invalid escape in the specified encoding"); REQUIRE_UTF8(flagp); @@ -12747,14 +13215,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto not_fold_common; } else /* A regular FOLD code point */ - if (! ( UTF + if (! ( UTF #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ || UNICODE_DOT_DOT_VERSION > 0) - /* See comments for join_exact() as to why we fold this - * non-UTF at compile time */ - || (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S) + /* See comments for join_exact() as to why we fold + * this non-UTF at compile time */ + || ( node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S) #endif )) { /* Here, are folding and are not UTF-8 encoded; therefore @@ -13097,9 +13565,6 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; } - else if (end >= NUM_ANYOF_CODE_POINTS) { - ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES; - } /* Quit if are above what we should change */ if (start >= NUM_ANYOF_CODE_POINTS) { @@ -13147,202 +13612,816 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) #define POSIXCC_DONE(c) ((c) == ':') #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) +#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';') + +#define WARNING_PREFIX "Assuming NOT a POSIX class since " +#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one" +#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon" + +#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1) + +/* 'posix_warnings' and 'warn_text' are names of variables in the following + * routine. q.v. */ +#define ADD_POSIX_WARNING(p, text) STMT_START { \ + if (posix_warnings) { \ + if (! warn_text) warn_text = newAV(); \ + av_push(warn_text, Perl_newSVpvf(aTHX_ \ + WARNING_PREFIX \ + text \ + REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(p))); \ + } \ + } STMT_END -PERL_STATIC_INLINE I32 -S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) +STATIC int +S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, + + const char * const s, /* Where the putative posix class begins. + Normally, this is one past the '['. This + parameter exists so it can be somewhere + besides RExC_parse. */ + char ** updated_parse_ptr, /* Where to set the updated parse pointer, or + NULL */ + AV ** posix_warnings, /* Where to place any generated warnings, or + NULL */ + const bool check_only /* Don't die if error */ +) { - I32 namedclass = OOB_NAMEDCLASS; + /* This parses what the caller thinks may be one of the three POSIX + * constructs: + * 1) a character class, like [:blank:] + * 2) a collating symbol, like [. .] + * 3) an equivalence class, like [= =] + * In the latter two cases, it croaks if it finds a syntactically legal + * one, as these are not handled by Perl. + * + * The main purpose is to look for a POSIX character class. It returns: + * a) the class number + * if it is a completely syntactically and semantically legal class. + * 'updated_parse_ptr', if not NULL, is set to point to just after the + * closing ']' of the class + * b) OOB_NAMEDCLASS + * if it appears that one of the three POSIX constructs was meant, but + * its specification was somehow defective. 'updated_parse_ptr', if + * not NULL, is set to point to the character just after the end + * character of the class. See below for handling of warnings. + * c) NOT_MEANT_TO_BE_A_POSIX_CLASS + * if it doesn't appear that a POSIX construct was intended. + * 'updated_parse_ptr' is not changed. No warnings nor errors are + * raised. + * + * In b) there may be errors or warnings generated. If 'check_only' is + * TRUE, then any errors are discarded. Warnings are returned to the + * caller via an AV* created into '*posix_warnings' if it is not NULL. If + * instead it is NULL, warnings are suppressed. This is done in all + * passes. The reason for this is that the rest of the parsing is heavily + * dependent on whether this routine found a valid posix class or not. If + * it did, the closing ']' is absorbed as part of the class. If no class, + * or an invalid one is found, any ']' will be considered the terminator of + * the outer bracketed character class, leading to very different results. + * In particular, a '(?[ ])' construct will likely have a syntax error if + * the class is parsed other than intended, and this will happen in pass1, + * before the warnings would normally be output. This mechanism allows the + * caller to output those warnings in pass1 just before dieing, giving a + * much better clue as to what is wrong. + * + * The reason for this function, and its complexity is that a bracketed + * character class can contain just about anything. But it's easy to + * mistype the very specific posix class syntax but yielding a valid + * regular bracketed class, so it silently gets compiled into something + * quite unintended. + * + * The solution adopted here maintains backward compatibility except that + * it adds a warning if it looks like a posix class was intended but + * improperly specified. The warning is not raised unless what is input + * very closely resembles one of the 14 legal posix classes. To do this, + * it uses fuzzy parsing. It calculates how many single-character edits it + * would take to transform what was input into a legal posix class. Only + * if that number is quite small does it think that the intention was a + * posix class. Obviously these are heuristics, and there will be cases + * where it errs on one side or another, and they can be tweaked as + * experience informs. + * + * The syntax for a legal posix class is: + * + * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/ + * + * What this routine considers syntactically to be an intended posix class + * is this (the comments indicate some restrictions that the pattern + * doesn't show): + * + * qr/(?x: \[? # The left bracket, possibly + * # omitted + * \h* # possibly followed by blanks + * (?: \^ \h* )? # possibly a misplaced caret + * [:;]? # The opening class character, + * # possibly omitted. A typo + * # semi-colon can also be used. + * \h* + * \^? # possibly a correctly placed + * # caret, but not if there was also + * # a misplaced one + * \h* + * .{3,15} # The class name. If there are + * # deviations from the legal syntax, + * # its edit distance must be close + * # to a real class name in order + * # for it to be considered to be + * # an intended posix class. + * \h* + * [:punct:]? # The closing class character, + * # possibly omitted. If not a colon + * # nor semi colon, the class name + * # must be even closer to a valid + * # one + * \h* + * \]? # The right bracket, possibly + * # omitted. + * )/ + * + * In the above, \h must be ASCII-only. + * + * These are heuristics, and can be tweaked as field experience dictates. + * There will be cases when someone didn't intend to specify a posix class + * that this warns as being so. The goal is to minimize these, while + * maximizing the catching of things intended to be a posix class that + * aren't parsed as such. + */ + + const char* p = s; + const char * const e = RExC_end; + unsigned complement = 0; /* If to complement the class */ + bool found_problem = FALSE; /* Assume OK until proven otherwise */ + bool has_opening_bracket = FALSE; + bool has_opening_colon = FALSE; + int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find + valid class */ + AV* warn_text = NULL; /* any warning messages */ + const char * possible_end = NULL; /* used for a 2nd parse pass */ + const char* name_start; /* ptr to class name first char */ + + /* If the number of single-character typos the input name is away from a + * legal name is no more than this number, it is considered to have meant + * the legal name */ + int max_distance = 2; + + /* to store the name. The size determines the maximum length before we + * decide that no posix class was intended. Should be at least + * sizeof("alphanumeric") */ + UV input_text[15]; + + PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; + + if (p >= e) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + if (*(p - 1) != '[') { + ADD_POSIX_WARNING(p, "it doesn't start with a '['"); + found_problem = TRUE; + } + else { + has_opening_bracket = TRUE; + } + + /* They could be confused and think you can put spaces between the + * components */ + if (isBLANK(*p)) { + found_problem = TRUE; + + do { + p++; + } while (p < e && isBLANK(*p)); - PERL_ARGS_ASSERT_REGPPOSIXCC; + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); + } - if (value == '[' && RExC_parse + 1 < RExC_end && - /* I smell either [: or [= or [. -- POSIX has been here, right? */ - POSIXCC(UCHARAT(RExC_parse))) + /* For [. .] and [= =]. These are quite different internally from [: :], + * so they are handled separately. */ + if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' + and 1 for at least one char in it + */ { - const char c = UCHARAT(RExC_parse); - char* const s = RExC_parse++; + const char open_char = *p; + const char * temp_ptr = p + 1; + + /* These two constructs are not handled by perl, and if we find a + * syntactically valid one, we croak. khw, who wrote this code, finds + * this explanation of them very unclear: + * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html + * And searching the rest of the internet wasn't very helpful either. + * It looks like just about any byte can be in these constructs, + * depending on the locale. But unless the pattern is being compiled + * under /l, which is very rare, Perl runs under the C or POSIX locale. + * In that case, it looks like [= =] isn't allowed at all, and that + * [. .] could be any single code point, but for longer strings the + * constituent characters would have to be the ASCII alphabetics plus + * the minus-hyphen. Any sensible locale definition would limit itself + * to these. And any portable one definitely should. Trying to parse + * the general case is a nightmare (see [perl #127604]). So, this code + * looks only for interiors of these constructs that match: + * qr/.|[-\w]{2,}/ + * Using \w relaxes the apparent rules a little, without adding much + * danger of mistaking something else for one of these constructs. + * + * [. .] in some implementations described on the internet is usable to + * escape a character that otherwise is special in bracketed character + * classes. For example [.].] means a literal right bracket instead of + * the ending of the class + * + * [= =] can legitimately contain a [. .] construct, but we don't + * handle this case, as that [. .] construct will later get parsed + * itself and croak then. And [= =] is checked for even when not under + * /l, as Perl has long done so. + * + * The code below relies on there being a trailing NUL, so it doesn't + * have to keep checking if the parse ptr < e. + */ + if (temp_ptr[1] == open_char) { + temp_ptr++; + } + else while ( temp_ptr < e + && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) + { + temp_ptr++; + } - while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) - RExC_parse++; - if (RExC_parse == RExC_end) { - if (strict) { + if (*temp_ptr == open_char) { + temp_ptr++; + if (*temp_ptr == ']') { + temp_ptr++; + if (! found_problem && ! check_only) { + RExC_parse = (char *) temp_ptr; + vFAIL3("POSIX syntax [%c %c] is reserved for future " + "extensions", open_char, open_char); + } - /* Try to give a better location for the error (than the end of - * the string) by looking for the matching ']' */ - RExC_parse = s; - while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { - RExC_parse++; + /* Here, the syntax wasn't completely valid, or else the call + * is to check-only */ + if (updated_parse_ptr) { + *updated_parse_ptr = (char *) temp_ptr; } - vFAIL2("Unmatched '%c' in POSIX class", c); + + return OOB_NAMEDCLASS; } - /* Grandfather lone [:, [=, [. */ - RExC_parse = s; } - else { - const char* const t = RExC_parse++; /* skip over the c */ - assert(*t == c); - - if (UCHARAT(RExC_parse) == ']') { - const char *posixcc = s + 1; - RExC_parse++; /* skip over the ending ] */ - - if (*s == ':') { - const I32 complement = *posixcc == '^' ? *posixcc++ : 0; - const I32 skip = t - posixcc; - - /* Initially switch on the length of the name. */ - switch (skip) { - case 4: - if (memEQ(posixcc, "word", 4)) /* this is not POSIX, - this is the Perl \w - */ - namedclass = ANYOF_WORDCHAR; - break; - case 5: - /* Names all of length 5. */ - /* alnum alpha ascii blank cntrl digit graph lower - print punct space upper */ - /* Offset 4 gives the best switch position. */ - switch (posixcc[4]) { - case 'a': - if (memEQ(posixcc, "alph", 4)) /* alpha */ - namedclass = ANYOF_ALPHA; - break; - case 'e': - if (memEQ(posixcc, "spac", 4)) /* space */ - namedclass = ANYOF_SPACE; - break; - case 'h': - if (memEQ(posixcc, "grap", 4)) /* graph */ - namedclass = ANYOF_GRAPH; - break; - case 'i': - if (memEQ(posixcc, "asci", 4)) /* ascii */ - namedclass = ANYOF_ASCII; - break; - case 'k': - if (memEQ(posixcc, "blan", 4)) /* blank */ - namedclass = ANYOF_BLANK; - break; - case 'l': - if (memEQ(posixcc, "cntr", 4)) /* cntrl */ - namedclass = ANYOF_CNTRL; - break; - case 'm': - if (memEQ(posixcc, "alnu", 4)) /* alnum */ - namedclass = ANYOF_ALPHANUMERIC; - break; - case 'r': - if (memEQ(posixcc, "lowe", 4)) /* lower */ - namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; - else if (memEQ(posixcc, "uppe", 4)) /* upper */ - namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; - break; - case 't': - if (memEQ(posixcc, "digi", 4)) /* digit */ - namedclass = ANYOF_DIGIT; - else if (memEQ(posixcc, "prin", 4)) /* print */ - namedclass = ANYOF_PRINT; - else if (memEQ(posixcc, "punc", 4)) /* punct */ - namedclass = ANYOF_PUNCT; - break; - } - break; - case 6: - if (memEQ(posixcc, "xdigit", 6)) - namedclass = ANYOF_XDIGIT; - break; - } - if (namedclass == OOB_NAMEDCLASS) - vFAIL2utf8f( - "POSIX class [:%"UTF8f":] unknown", - UTF8fARG(UTF, t - s - 1, s + 1)); + /* If we find something that started out to look like one of these + * constructs, but isn't, we continue below so that it can be checked + * for being a class name with a typo of '.' or '=' instead of a colon. + * */ + } - /* The #defines are structured so each complement is +1 to - * the normal one */ - if (complement) { - namedclass++; - } - assert (posixcc[skip] == ':'); - assert (posixcc[skip+1] == ']'); - } else if (!SIZE_ONLY) { - /* [[=foo=]] and [[.foo.]] are still future. */ - - /* adjust RExC_parse so the warning shows after - the class closes */ - while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') - RExC_parse++; - vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); - } - } else { - /* Maternal grandfather: - * "[:" ending in ":" but not in ":]" */ - if (strict) { - vFAIL("Unmatched '[' in POSIX class"); - } + /* Here, we think there is a possibility that a [: :] class was meant, and + * we have the first real character. It could be they think the '^' comes + * first */ + if (*p == '^') { + found_problem = TRUE; + ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon"); + complement = 1; + p++; - /* Grandfather lone [:, [=, [. */ - RExC_parse = s; - } - } + if (isBLANK(*p)) { + found_problem = TRUE; + + do { + p++; + } while (p < e && isBLANK(*p)); + + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); + } } - return namedclass; -} + /* But the first character should be a colon, which they could have easily + * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to + * distinguish from a colon, so treat that as a colon). */ + if (*p == ':') { + p++; + has_opening_colon = TRUE; + } + else if (*p == ';') { + found_problem = TRUE; + p++; + ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); + has_opening_colon = TRUE; + } + else { + found_problem = TRUE; + ADD_POSIX_WARNING(p, "there must be a starting ':'"); -STATIC bool -S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) -{ - /* This applies some heuristics at the current parse position (which should - * be at a '[') to see if what follows might be intended to be a [:posix:] - * class. It returns true if it really is a posix class, of course, but it - * also can return true if it thinks that what was intended was a posix - * class that didn't quite make it. - * - * It will return true for - * [:alphanumerics: - * [:alphanumerics] (as long as the ] isn't followed immediately by a - * ')' indicating the end of the (?[ - * [:any garbage including %^&$ punctuation:] - * - * This is designed to be called only from S_handle_regex_sets; it could be - * easily adapted to be called from the spot at the beginning of regclass() - * that checks to see in a normal bracketed class if the surrounding [] - * have been omitted ([:word:] instead of [[:word:]]). But doing so would - * change long-standing behavior, so I (khw) didn't do that */ - char* p = RExC_parse + 1; - char first_char = *p; + /* Consider an initial punctuation (not one of the recognized ones) to + * be a left terminator */ + if (*p != '^' && *p != ']' && isPUNCT(*p)) { + p++; + } + } - PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; + /* They may think that you can put spaces between the components */ + if (isBLANK(*p)) { + found_problem = TRUE; - assert(*(p - 1) == '['); + do { + p++; + } while (p < e && isBLANK(*p)); - if (! POSIXCC(first_char)) { - return FALSE; + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); } - p++; - while (p < RExC_end && isWORDCHAR(*p)) p++; + if (*p == '^') { - if (p >= RExC_end) { - return FALSE; + /* We consider something like [^:^alnum:]] to not have been intended to + * be a posix class, but XXX maybe we should */ + if (complement) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + complement = 1; + p++; } - if (p - RExC_parse > 2 /* Got at least 1 word character */ - && (*p == first_char - || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) - { - return TRUE; + /* Again, they may think that you can put spaces between the components */ + if (isBLANK(*p)) { + found_problem = TRUE; + + do { + p++; + } while (p < e && isBLANK(*p)); + + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); } - p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); + if (*p == ']') { + + /* XXX This ']' may be a typo, and something else was meant. But + * treating it as such creates enough complications, that that + * possibility isn't currently considered here. So we assume that the + * ']' is what is intended, and if we've already found an initial '[', + * this leaves this construct looking like [:] or [:^], which almost + * certainly weren't intended to be posix classes */ + if (has_opening_bracket) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* But this function can be called when we parse the colon for + * something like qr/[alpha:]]/, so we back up to look for the + * beginning */ + p--; + + if (*p == ';') { + found_problem = TRUE; + ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); + } + else if (*p != ':') { + + /* XXX We are currently very restrictive here, so this code doesn't + * consider the possibility that, say, /[alpha.]]/ was intended to + * be a posix class. */ + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* Here we have something like 'foo:]'. There was no initial colon, + * and we back up over 'foo. XXX Unlike the going forward case, we + * don't handle typos of non-word chars in the middle */ + has_opening_colon = FALSE; + p--; + + while (p > RExC_start && isWORDCHAR(*p)) { + p--; + } + p++; + + /* Here, we have positioned ourselves to where we think the first + * character in the potential class is */ + } + + /* Now the interior really starts. There are certain key characters that + * can end the interior, or these could just be typos. To catch both + * cases, we may have to do two passes. In the first pass, we keep on + * going unless we come to a sequence that matches + * qr/ [[:punct:]] [[:blank:]]* \] /xa + * This means it takes a sequence to end the pass, so two typos in a row if + * that wasn't what was intended. If the class is perfectly formed, just + * this one pass is needed. We also stop if there are too many characters + * being accumulated, but this number is deliberately set higher than any + * real class. It is set high enough so that someone who thinks that + * 'alphanumeric' is a correct name would get warned that it wasn't. + * While doing the pass, we keep track of where the key characters were in + * it. If we don't find an end to the class, and one of the key characters + * was found, we redo the pass, but stop when we get to that character. + * Thus the key character was considered a typo in the first pass, but a + * terminator in the second. If two key characters are found, we stop at + * the second one in the first pass. Again this can miss two typos, but + * catches a single one + * + * In the first pass, 'possible_end' starts as NULL, and then gets set to + * point to the first key character. For the second pass, it starts as -1. + * */ + + name_start = p; + parse_name: + { + bool has_blank = FALSE; + bool has_upper = FALSE; + bool has_terminating_colon = FALSE; + bool has_terminating_bracket = FALSE; + bool has_semi_colon = FALSE; + unsigned int name_len = 0; + int punct_count = 0; + + while (p < e) { + + /* Squeeze out blanks when looking up the class name below */ + if (isBLANK(*p) ) { + has_blank = TRUE; + found_problem = TRUE; + p++; + continue; + } + + /* The name will end with a punctuation */ + if (isPUNCT(*p)) { + const char * peek = p + 1; + + /* Treat any non-']' punctuation followed by a ']' (possibly + * with intervening blanks) as trying to terminate the class. + * ']]' is very likely to mean a class was intended (but + * missing the colon), but the warning message that gets + * generated shows the error position better if we exit the + * loop at the bottom (eventually), so skip it here. */ + if (*p != ']') { + if (peek < e && isBLANK(*peek)) { + has_blank = TRUE; + found_problem = TRUE; + do { + peek++; + } while (peek < e && isBLANK(*peek)); + } + + if (peek < e && *peek == ']') { + has_terminating_bracket = TRUE; + if (*p == ':') { + has_terminating_colon = TRUE; + } + else if (*p == ';') { + has_semi_colon = TRUE; + has_terminating_colon = TRUE; + } + else { + found_problem = TRUE; + } + p = peek + 1; + goto try_posix; + } + } + + /* Here we have punctuation we thought didn't end the class. + * Keep track of the position of the key characters that are + * more likely to have been class-enders */ + if (*p == ']' || *p == '[' || *p == ':' || *p == ';') { + + /* Allow just one such possible class-ender not actually + * ending the class. */ + if (possible_end) { + break; + } + possible_end = p; + } + + /* If we have too many punctuation characters, no use in + * keeping going */ + if (++punct_count > max_distance) { + break; + } + + /* Treat the punctuation as a typo. */ + input_text[name_len++] = *p; + p++; + } + else if (isUPPER(*p)) { /* Use lowercase for lookup */ + input_text[name_len++] = toLOWER(*p); + has_upper = TRUE; + found_problem = TRUE; + p++; + } else if (! UTF || UTF8_IS_INVARIANT(*p)) { + input_text[name_len++] = *p; + p++; + } + else { + input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL); + p+= UTF8SKIP(p); + } + + /* The declaration of 'input_text' is how long we allow a potential + * class name to be, before saying they didn't mean a class name at + * all */ + if (name_len >= C_ARRAY_LENGTH(input_text)) { + break; + } + } + + /* We get to here when the possible class name hasn't been properly + * terminated before: + * 1) we ran off the end of the pattern; or + * 2) found two characters, each of which might have been intended to + * be the name's terminator + * 3) found so many punctuation characters in the purported name, + * that the edit distance to a valid one is exceeded + * 4) we decided it was more characters than anyone could have + * intended to be one. */ + + found_problem = TRUE; + + /* In the final two cases, we know that looking up what we've + * accumulated won't lead to a match, even a fuzzy one. */ + if ( name_len >= C_ARRAY_LENGTH(input_text) + || punct_count > max_distance) + { + /* If there was an intermediate key character that could have been + * an intended end, redo the parse, but stop there */ + if (possible_end && possible_end != (char *) -1) { + possible_end = (char *) -1; /* Special signal value to say + we've done a first pass */ + p = name_start; + goto parse_name; + } + + /* Otherwise, it can't have meant to have been a class */ + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* If we ran off the end, and the final character was a punctuation + * one, back up one, to look at that final one just below. Later, we + * will restore the parse pointer if appropriate */ + if (name_len && p == e && isPUNCT(*(p-1))) { + p--; + name_len--; + } + + if (p < e && isPUNCT(*p)) { + if (*p == ']') { + has_terminating_bracket = TRUE; + + /* If this is a 2nd ']', and the first one is just below this + * one, consider that to be the real terminator. This gives a + * uniform and better positioning for the warning message */ + if ( possible_end + && possible_end != (char *) -1 + && *possible_end == ']' + && name_len && input_text[name_len - 1] == ']') + { + name_len--; + p = possible_end; + + /* And this is actually equivalent to having done the 2nd + * pass now, so set it to not try again */ + possible_end = (char *) -1; + } + } + else { + if (*p == ':') { + has_terminating_colon = TRUE; + } + else if (*p == ';') { + has_semi_colon = TRUE; + has_terminating_colon = TRUE; + } + p++; + } + } + + try_posix: + + /* Here, we have a class name to look up. We can short circuit the + * stuff below for short names that can't possibly be meant to be a + * class name. (We can do this on the first pass, as any second pass + * will yield an even shorter name) */ + if (name_len < 3) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* Find which class it is. Initially switch on the length of the name. + * */ + switch (name_len) { + case 4: + if (memEQ(name_start, "word", 4)) { + /* this is not POSIX, this is the Perl \w */ + class_number = ANYOF_WORDCHAR; + } + break; + case 5: + /* Names all of length 5: alnum alpha ascii blank cntrl digit + * graph lower print punct space upper + * Offset 4 gives the best switch position. */ + switch (name_start[4]) { + case 'a': + if (memEQ(name_start, "alph", 4)) /* alpha */ + class_number = ANYOF_ALPHA; + break; + case 'e': + if (memEQ(name_start, "spac", 4)) /* space */ + class_number = ANYOF_SPACE; + break; + case 'h': + if (memEQ(name_start, "grap", 4)) /* graph */ + class_number = ANYOF_GRAPH; + break; + case 'i': + if (memEQ(name_start, "asci", 4)) /* ascii */ + class_number = ANYOF_ASCII; + break; + case 'k': + if (memEQ(name_start, "blan", 4)) /* blank */ + class_number = ANYOF_BLANK; + break; + case 'l': + if (memEQ(name_start, "cntr", 4)) /* cntrl */ + class_number = ANYOF_CNTRL; + break; + case 'm': + if (memEQ(name_start, "alnu", 4)) /* alnum */ + class_number = ANYOF_ALPHANUMERIC; + break; + case 'r': + if (memEQ(name_start, "lowe", 4)) /* lower */ + class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; + else if (memEQ(name_start, "uppe", 4)) /* upper */ + class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; + break; + case 't': + if (memEQ(name_start, "digi", 4)) /* digit */ + class_number = ANYOF_DIGIT; + else if (memEQ(name_start, "prin", 4)) /* print */ + class_number = ANYOF_PRINT; + else if (memEQ(name_start, "punc", 4)) /* punct */ + class_number = ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(name_start, "xdigit", 6)) + class_number = ANYOF_XDIGIT; + break; + } + + /* If the name exactly matches a posix class name the class number will + * here be set to it, and the input almost certainly was meant to be a + * posix class, so we can skip further checking. If instead the syntax + * is exactly correct, but the name isn't one of the legal ones, we + * will return that as an error below. But if neither of these apply, + * it could be that no posix class was intended at all, or that one + * was, but there was a typo. We tease these apart by doing fuzzy + * matching on the name */ + if (class_number == OOB_NAMEDCLASS && found_problem) { + const UV posix_names[][6] = { + { 'a', 'l', 'n', 'u', 'm' }, + { 'a', 'l', 'p', 'h', 'a' }, + { 'a', 's', 'c', 'i', 'i' }, + { 'b', 'l', 'a', 'n', 'k' }, + { 'c', 'n', 't', 'r', 'l' }, + { 'd', 'i', 'g', 'i', 't' }, + { 'g', 'r', 'a', 'p', 'h' }, + { 'l', 'o', 'w', 'e', 'r' }, + { 'p', 'r', 'i', 'n', 't' }, + { 'p', 'u', 'n', 'c', 't' }, + { 's', 'p', 'a', 'c', 'e' }, + { 'u', 'p', 'p', 'e', 'r' }, + { 'w', 'o', 'r', 'd' }, + { 'x', 'd', 'i', 'g', 'i', 't' } + }; + /* The names of the above all have added NULs to make them the same + * size, so we need to also have the real lengths */ + const UV posix_name_lengths[] = { + sizeof("alnum") - 1, + sizeof("alpha") - 1, + sizeof("ascii") - 1, + sizeof("blank") - 1, + sizeof("cntrl") - 1, + sizeof("digit") - 1, + sizeof("graph") - 1, + sizeof("lower") - 1, + sizeof("print") - 1, + sizeof("punct") - 1, + sizeof("space") - 1, + sizeof("upper") - 1, + sizeof("word") - 1, + sizeof("xdigit")- 1 + }; + unsigned int i; + int temp_max = max_distance; /* Use a temporary, so if we + reparse, we haven't changed the + outer one */ + + /* Use a smaller max edit distance if we are missing one of the + * delimiters */ + if ( has_opening_bracket + has_opening_colon < 2 + || has_terminating_bracket + has_terminating_colon < 2) + { + temp_max--; + } + + /* See if the input name is close to a legal one */ + for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) { + + /* Short circuit call if the lengths are too far apart to be + * able to match */ + if (abs( (int) (name_len - posix_name_lengths[i])) + > temp_max) + { + continue; + } + + if (edit_distance(input_text, + posix_names[i], + name_len, + posix_name_lengths[i], + temp_max + ) + > -1) + { /* If it is close, it probably was intended to be a class */ + goto probably_meant_to_be; + } + } + + /* Here the input name is not close enough to a valid class name + * for us to consider it to be intended to be a posix class. If + * we haven't already done so, and the parse found a character that + * could have been terminators for the name, but which we absorbed + * as typos during the first pass, repeat the parse, signalling it + * to stop at that character */ + if (possible_end && possible_end != (char *) -1) { + possible_end = (char *) -1; + p = name_start; + goto parse_name; + } + + /* Here neither pass found a close-enough class name */ + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + probably_meant_to_be: + + /* Here we think that a posix specification was intended. Update any + * parse pointer */ + if (updated_parse_ptr) { + *updated_parse_ptr = (char *) p; + } + + /* If a posix class name was intended but incorrectly specified, we + * output or return the warnings */ + if (found_problem) { + + /* We set flags for these issues in the parse loop above instead of + * adding them to the list of warnings, because we can parse it + * twice, and we only want one warning instance */ + if (has_upper) { + ADD_POSIX_WARNING(p, "the name must be all lowercase letters"); + } + if (has_blank) { + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); + } + if (has_semi_colon) { + ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); + } + else if (! has_terminating_colon) { + ADD_POSIX_WARNING(p, "there is no terminating ':'"); + } + if (! has_terminating_bracket) { + ADD_POSIX_WARNING(p, "there is no terminating ']'"); + } + + if (warn_text) { + if (posix_warnings) { + /* mortalize to avoid a leak with FATAL warnings */ + *posix_warnings = (AV *) sv_2mortal((SV *) warn_text); + } + else { + SvREFCNT_dec_NN(warn_text); + } + } + } + else if (class_number != OOB_NAMEDCLASS) { + /* If it is a known class, return the class. The class number + * #defines are structured so each complement is +1 to the normal + * one */ + return class_number + complement; + } + else if (! check_only) { + + /* Here, it is an unrecognized class. This is an error (unless the + * call is to check only, which we've already handled above) */ + const char * const complement_string = (complement) + ? "^" + : ""; + RExC_parse = (char *) p; + vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown", + complement_string, + UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); + } + } - return (p - && p - RExC_parse > 2 /* [:] evaluates to colon; - [::] is a bad posix class. */ - && first_char == *(p - 1)); + return OOB_NAMEDCLASS; } +#undef ADD_POSIX_WARNING STATIC unsigned int S_regex_set_precedence(const U8 my_operator) { @@ -13398,6 +14477,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, const bool save_fold = FOLD; /* Temporary */ char *save_end, *save_parse; /* Temporaries */ const bool in_locale = LOC; /* we turn off /l during processing */ + AV* posix_warnings = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -13435,27 +14515,27 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, default: break; case '\\': - /* Skip the next byte (which could cause us to end up in - * the middle of a UTF-8 character, but since none of those - * are confusable with anything we currently handle in this - * switch (invariants all), it's safe. We'll just hit the - * default: case next time and keep on incrementing until - * we find one of the invariants we do handle. */ + /* Skip past this, so the next character gets skipped, after + * the switch */ RExC_parse++; if (*RExC_parse == 'c') { /* Skip the \cX notation for control characters */ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; } break; + case '[': { - /* If this looks like it is a [:posix:] class, leave the - * parse pointer at the '[' to fool regclass() into - * thinking it is part of a '[[:posix:]]'. That function - * will use strict checking to force a syntax error if it - * doesn't work out to a legitimate class */ - bool is_posix_class - = could_it_be_a_POSIX_class(pRExC_state); + /* See if this is a [:posix:] class. */ + bool is_posix_class = (OOB_NAMEDCLASS + < handle_possible_posix(pRExC_state, + RExC_parse + 1, + NULL, + NULL, + TRUE /* checking only */)); + /* If it is a posix class, leave the parse pointer at the + * '[' to fool regclass() into thinking it is part of a + * '[[:posix:]]'. */ if (! is_posix_class) { RExC_parse++; } @@ -13470,7 +14550,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, TRUE, /* silence non-portable warnings. */ TRUE, /* strict */ FALSE, /* Require return to be an ANYOF */ - ¤t + ¤t, + &posix_warnings )) FAIL2("panic: regclass returned NULL to handle_sets, " "flags=%#"UVxf"", (UV) *flagp); @@ -13488,9 +14569,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, case ']': if (depth--) break; RExC_parse++; - if (RExC_parse < RExC_end - && *RExC_parse == ')') - { + if (*RExC_parse == ')') { node = reganode(pRExC_state, ANYOF, 0); RExC_size += ANYOF_SKIP; nextchar(pRExC_state); @@ -13509,6 +14588,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } no_close: + /* We output the messages even if warnings are off, because we'll fail + * the very next thing, and these give a likely diagnosis for that */ + if (posix_warnings && av_tindex(posix_warnings) >= 0) { + output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); + } + FAIL("Syntax error in (?[...])"); } @@ -13516,10 +14601,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), - UTF8fARG(UTF, - RExC_end - RExC_start - (RExC_parse - RExC_precomp), - RExC_precomp + (RExC_parse - RExC_precomp))); + REPORT_LOCATION_ARGS(RExC_parse)); /* Everything in this construct is a metacharacter. Operands begin with * either a '\' (for an escape sequence), or a '[' for a bracketed @@ -13629,7 +14711,8 @@ redo_curchar: case '(': - if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?')) + if ( RExC_parse < RExC_end - 1 + && (UCHARAT(RExC_parse + 1) == '?')) { /* If is a '(?', could be an embedded '(?flags:(?[...])'. * This happens when we have some thing like @@ -13685,12 +14768,12 @@ redo_curchar: * inversion list, and RExC_parse points to the trailing * ']'; the next character should be the ')' */ RExC_parse++; - assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')'); + assert(UCHARAT(RExC_parse) == ')'); /* Then the ')' matching the original '(' handled by this * case: statement */ RExC_parse++; - assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')'); + assert(UCHARAT(RExC_parse) == ')'); RExC_parse++; RExC_flags = save_flags; @@ -13730,7 +14813,8 @@ redo_curchar: FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ FALSE, /* Require return to be an ANYOF */ - ¤t)) + ¤t, + NULL)) { FAIL2("panic: regclass returned NULL to handle_sets, " "flags=%#"UVxf"", (UV) *flagp); @@ -13743,23 +14827,33 @@ redo_curchar: case '[': /* Is a bracketed character class */ { - bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); - + /* See if this is a [:posix:] class. */ + bool is_posix_class = (OOB_NAMEDCLASS + < handle_possible_posix(pRExC_state, + RExC_parse + 1, + NULL, + NULL, + TRUE /* checking only */)); + /* If it is a posix class, leave the parse pointer at the '[' + * to fool regclass() into thinking it is part of a + * '[[:posix:]]'. */ if (! is_posix_class) { RExC_parse++; } /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if * multi-char folds are allowed. */ - if(!regclass(pRExC_state, flagp,depth+1, - is_posix_class, /* parse the whole char class - only if not a posix class */ - FALSE, /* don't allow multi-char folds */ - FALSE, /* don't silence non-portable warnings. */ - TRUE, /* strict */ - FALSE, /* Require return to be an ANYOF */ - ¤t - )) + if (!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char + class only if not a + posix class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. */ + TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ + ¤t, + NULL + )) { FAIL2("panic: regclass returned NULL to handle_sets, " "flags=%#"UVxf"", (UV) *flagp); @@ -13900,7 +14994,13 @@ redo_curchar: } lhs = av_pop(stack); - assert(IS_OPERAND(lhs)); + + if (! IS_OPERAND(lhs)) { + + /* This can happen when there is an empty (), like in + * /(?[[0]+()+])/ */ + goto bad_syntax; + } switch (stacked_operator) { case '&': @@ -13946,9 +15046,20 @@ redo_curchar: av_push(stack, rhs); goto redo_curchar; - case '!': /* Highest priority, right associative, so just push - onto stack */ - av_push(stack, newSVuv(curchar)); + case '!': /* Highest priority, right associative */ + + /* If what's already at the top of the stack is another '!", + * they just cancel each other out */ + if ( (top_ptr = av_fetch(stack, top_index, FALSE)) + && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) + { + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + } + else { /* Otherwise, since it's right associative, just push + onto the stack */ + av_push(stack, newSVuv(curchar)); + } break; default: @@ -14070,6 +15181,7 @@ redo_curchar: they're valid on this machine */ FALSE, /* similarly, no need for strict */ FALSE, /* Require return to be an ANYOF */ + NULL, NULL ); if (!node) @@ -14098,7 +15210,8 @@ redo_curchar: assert(OP(node) == ANYOF); OP(node) = ANYOFL; - ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8; + ANYOF_FLAGS(node) + |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } if (save_fold) { @@ -14195,6 +15308,43 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl } } +STATIC void +S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings) +{ + /* If the final parameter is NULL, output the elements of the array given + * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are + * pushed onto it, (creating if necessary) */ + + SV * msg; + const bool first_is_fatal = ! return_posix_warnings + && ckDEAD(packWARN(WARN_REGEXP)); + + PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS; + + while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { + if (return_posix_warnings) { + if (! *return_posix_warnings) { /* mortalize to not leak if + warnings are fatal */ + *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV()); + } + av_push(*return_posix_warnings, msg); + } + else { + if (first_is_fatal) { /* Avoid leaking this */ + av_undef(posix_warnings); /* This isn't necessary if the + array is mortal, but is a + fail-safe */ + (void) sv_2mortal(msg); + if (PASS2) { + SAVEFREESV(RExC_rx_sv); + } + } + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); + SvREFCNT_dec_NN(msg); + } + } +} + STATIC AV * S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) { @@ -14257,8 +15407,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \ STMT_START { \ if (do_skip) { \ - while ( p < RExC_end \ - && isBLANK_A(UCHARAT(p))) \ + while (isBLANK_A(UCHARAT(p))) \ { \ p++; \ } \ @@ -14276,7 +15425,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool strict, bool optimizable, /* ? Allow a non-ANYOF return node */ - SV** ret_invlist /* Return an inversion list, not a node */ + SV** ret_invlist, /* Return an inversion list, not a node */ + AV** return_posix_warnings ) { /* parse a bracketed class specification. Most of these will produce an @@ -14310,7 +15460,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, UV value = OOB_UNICODE, save_value = OOB_UNICODE; regnode *ret; STRLEN numlen; - IV namedclass = OOB_NAMEDCLASS; + int namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; bool need_class = 0; SV *listsv = NULL; @@ -14353,8 +15503,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, bool has_user_defined_property = FALSE; /* inversion list of code points this node matches only when the target - * string is in UTF-8. (Because is under /d) */ - SV* depends_list = NULL; + * string is in UTF-8. These are all non-ASCII, < 256. (Because is under + * /d) */ + SV* has_upper_latin1_only_utf8_matches = NULL; /* Inversion list of code points this node matches regardless of things * like locale, folding, utf8ness of the target string */ @@ -14388,6 +15539,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const char * orig_parse = RExC_parse; const SSize_t orig_size = RExC_size; bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ + + /* This variable is used to mark where the end in the input is of something + * that looks like a POSIX construct but isn't. During the parse, when + * something looks like it could be such a construct is encountered, it is + * checked for being one, but not if we've already checked this area of the + * input. Only after this position is reached do we check again */ + char *not_posix_region_end = RExC_parse - 1; + + AV* posix_warnings = NULL; + const bool do_posix_warnings = return_posix_warnings + || (PASS2 && ckWARN(WARN_REGEXP)); + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -14407,9 +15570,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = reganode(pRExC_state, (LOC) ? ANYOFL - : (DEPENDS_SEMANTICS) - ? ANYOFD - : ANYOF, + : ANYOF, 0); if (SIZE_ONLY) { @@ -14427,7 +15588,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); - if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + assert(RExC_parse <= RExC_end); + + if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ RExC_parse++; invert = TRUE; allow_multi_folds = FALSE; @@ -14436,22 +15599,25 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* Check that they didn't say [:posix:] instead of [[:posix:]] */ - if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { - const char *s = RExC_parse; - const char c = *s++; - - if (*s == '^') { - s++; + if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { + int maybe_class = handle_possible_posix(pRExC_state, + RExC_parse, + ¬_posix_region_end, + NULL, + TRUE /* checking only */); + if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { + SAVEFREESV(RExC_rx_sv); + ckWARN4reg(not_posix_region_end, + "POSIX syntax [%c %c] belongs inside character classes%s", + *RExC_parse, *RExC_parse, + (maybe_class == OOB_NAMEDCLASS) + ? ((POSIXCC_NOTYET(*RExC_parse)) + ? " (but this one isn't implemented)" + : " (but this one isn't fully valid)") + : "" + ); + (void)ReREFCNT_inc(RExC_rx_sv); } - while (isWORDCHAR(*s)) - s++; - if (*s && c == *s && s[1] == ']') { - SAVEFREESV(RExC_rx_sv); - ckWARN3reg(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); - (void)ReREFCNT_inc(RExC_rx_sv); - } } /* If the caller wants us to just parse a single element, accomplish this @@ -14465,6 +15631,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, goto charclassloop; while (1) { + + if ( posix_warnings + && av_tindex(posix_warnings) >= 0 + && RExC_parse > not_posix_region_end) + { + /* Warnings about posix class issues are considered tentative until + * we are far enough along in the parse that we can no longer + * change our mind, at which point we either output them or add + * them, if it has so specified, to what gets returned to the + * caller. This is done each time through the loop so that a later + * class won't zap them before they have been dealt with. */ + output_or_return_posix_warnings(pRExC_state, posix_warnings, + return_posix_warnings); + } + if (RExC_parse >= stop_ptr) { break; } @@ -14486,7 +15667,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, element_count++; non_portable_endpoint = 0; } - if (UTF) { + if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT); @@ -14495,14 +15676,53 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else value = UCHARAT(RExC_parse++); - if (value == '[' - && RExC_parse < RExC_end - && POSIXCC(UCHARAT(RExC_parse))) - { - namedclass = regpposixcc(pRExC_state, value, strict); - } - else if (value == '\\') { + if (value == '[') { + char * posix_class_end; + namedclass = handle_possible_posix(pRExC_state, + RExC_parse, + &posix_class_end, + do_posix_warnings ? &posix_warnings : NULL, + FALSE /* die if error */); + if (namedclass > OOB_NAMEDCLASS) { + + /* If there was an earlier attempt to parse this particular + * posix class, and it failed, it was a false alarm, as this + * successful one proves */ + if ( posix_warnings + && av_tindex(posix_warnings) >= 0 + && not_posix_region_end >= RExC_parse + && not_posix_region_end <= posix_class_end) + { + av_undef(posix_warnings); + } + + RExC_parse = posix_class_end; + } + else if (namedclass == OOB_NAMEDCLASS) { + not_posix_region_end = posix_class_end; + } + else { + namedclass = OOB_NAMEDCLASS; + } + } + else if ( RExC_parse - 1 > not_posix_region_end + && MAYBE_POSIXCC(value)) + { + (void) handle_possible_posix( + pRExC_state, + RExC_parse - 1, /* -1 because parse has already been + advanced */ + ¬_posix_region_end, + do_posix_warnings ? &posix_warnings : NULL, + TRUE /* checking only */); + } + else if (value == '\\') { /* Is a backslash; get the code point of the char after it */ + + if (RExC_parse >= RExC_end) { + vFAIL("Unmatched ["); + } + if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, @@ -14544,6 +15764,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &value, /* Yes single value */ &cp_count, /* Multiple code pt count */ flagp, + strict, depth) ) { @@ -14556,11 +15777,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, vFAIL("\\N in a character class must be a named character: \\N{...}"); } else if (cp_count == 0) { - if (strict) { - RExC_parse++; /* Position after the "}" */ - vFAIL("Zero length \\N{}"); - } - else if (PASS2) { + if (PASS2) { ckWARNreg(RExC_parse, "Ignoring zero length \\N{} in character class"); } @@ -14616,7 +15833,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) - vFAIL2("Empty \\%c{}", (U8)value); + vFAIL2("Empty \\%c", (U8)value); if (*RExC_parse == '{') { const U8 c = (U8)value; e = strchr(RExC_parse, '}'); @@ -14664,6 +15881,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SV* invlist; char* name; char* base_name; /* name after any packages are stripped */ + char* lookup_name = NULL; const char * const colon_colon = "::"; /* Try to get the definition of the property into @@ -14671,25 +15889,28 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - name = savepv(Perl_form(aTHX_ - "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - )); + name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse)); + SAVEFREEPV(name); + if (FOLD) { + lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name)); + } /* Look up the property name, and get its swash and * inversion list, if the property is found */ - if (swash) { /* Return any left-overs */ - SvREFCNT_dec_NN(swash); - } - swash = _core_swash_init("utf8", name, &PL_sv_undef, + SvREFCNT_dec(swash); /* Free any left-overs */ + swash = _core_swash_init("utf8", + (lookup_name) + ? lookup_name + : name, + &PL_sv_undef, 1, /* binary */ 0, /* not tr/// */ NULL, /* No inversion list */ &swash_init_flags ); + if (lookup_name) { + Safefree(lookup_name); + } if (! swash || ! (invlist = _get_swash_invlist(swash))) { HV* curpkg = (IN_PERL_COMPILETIME) ? PL_curstash @@ -14752,26 +15973,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, pkgname, name); n = strlen(full_name); - Safefree(name); name = savepvn(full_name, n); + SAVEFREEPV(name); } } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n", (value == 'p' ? '+' : '!'), - UTF8fARG(UTF, n, name)); + (FOLD) ? "__" : "", + UTF8fARG(UTF, n, name), + (FOLD) ? "_i" : ""); has_user_defined_property = TRUE; optimizable = FALSE; /* Will have to leave this an ANYOF node */ - /* We don't know yet, so have to assume that the - * property could match something in the upper Latin1 - * range, hence something that isn't utf8. Note that - * this would cause things in to match - * inappropriately, except that any \p{}, including - * this one forces Unicode semantics, which means there - * is no */ - ANYOF_FLAGS(ret) - |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES; + /* We don't know yet what this matches, so have to flag + * it */ + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } else { @@ -14812,7 +16029,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, _invlist_union(properties, invlist, &properties); } } - Safefree(name); } RExC_parse = e + 1; namedclass = ANYOF_UNIPROP; /* no official name, but it's @@ -14907,7 +16123,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, recode_encoding: if (! RExC_override_recoding) { SV* enc = _get_encoding(); - value = reg_recode((const char)(U8)value, &enc); + value = reg_recode((U8)value, &enc); if (!enc) { if (strict) { vFAIL("Invalid escape in the specified encoding"); @@ -15365,7 +16581,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, "\"%.*s\" is more clearly written simply as \"%s\"", (int) (RExC_parse - rangebegin), rangebegin, - cntrl_to_mnemonic((char) value) + cntrl_to_mnemonic((U8) value) ); } } @@ -15418,6 +16634,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, range = 0; /* this range (if it was one) is done now */ } /* End of loop through all the text within the brackets */ + + if ( posix_warnings && av_tindex(posix_warnings) >= 0) { + output_or_return_posix_warnings(pRExC_state, posix_warnings, + return_posix_warnings); + } + /* If anything in the class expands to more than one character, we have to * deal with them by building up a substitute parse string, and recursively * calling reg() on it, instead of proceeding */ @@ -15427,11 +16649,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN len; char *save_end = RExC_end; char *save_parse = RExC_parse; + char *save_start = RExC_start; + STRLEN prefix_end = 0; /* We copy the character class after a + prefix supplied here. This is the size + + 1 of that prefix */ bool first_time = TRUE; /* First multi-char occurrence doesn't get a "|" */ I32 reg_flags; assert(! invert); + assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */ + #if 0 /* Have decided not to deal with multi-char folds in inverted classes, because too confusing */ if (invert) { @@ -15465,8 +16693,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * multi-character folds, have to include it in recursive parsing */ if (element_count) { sv_catpv(substitute_parse, "|["); + prefix_end = SvCUR(substitute_parse); sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); - sv_catpv(substitute_parse, "]"); + + /* Put in a closing ']' only if not going off the end, as otherwise + * we are adding something that really isn't there */ + if (RExC_parse < RExC_end) { + sv_catpv(substitute_parse, "]"); + } } sv_catpv(substitute_parse, ")"); @@ -15479,7 +16713,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } #endif - RExC_parse = SvPV(substitute_parse, len); + /* Set up the data structure so that any errors will be properly + * reported. See the comments at the definition of + * REPORT_LOCATION_ARGS for details */ + RExC_precomp_adj = orig_parse - RExC_precomp; + RExC_start = RExC_parse = SvPV(substitute_parse, len); + RExC_adjusted_start = RExC_start + prefix_end; RExC_end = RExC_parse + len; RExC_in_multi_char_class = 1; RExC_override_recoding = 1; @@ -15489,7 +16728,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8); - RExC_parse = save_parse; + /* And restore so can parse the rest of the pattern */ + RExC_parse = save_parse; + RExC_start = RExC_adjusted_start = save_start; + RExC_precomp_adj = 0; RExC_end = save_end; RExC_in_multi_char_class = 0; RExC_override_recoding = 0; @@ -15518,9 +16760,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (UNLIKELY(posixl_matches_all)) { op = SANY; } - else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like - \w or [:digit:] or \p{foo} - */ + else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named + class, like \w or [:digit:] + or \p{foo} */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -15769,9 +17011,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, PL_fold_latin1[j]); } else { - depends_list = - add_cp_to_invlist(depends_list, - PL_fold_latin1[j]); + has_upper_latin1_only_utf8_matches + = add_cp_to_invlist( + has_upper_latin1_only_utf8_matches, + PL_fold_latin1[j]); } } @@ -15835,8 +17078,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else { /* Similarly folds involving non-ascii Latin1 * characters under /d are added to their list */ - depends_list = add_cp_to_invlist(depends_list, - c); + has_upper_latin1_only_utf8_matches + = add_cp_to_invlist( + has_upper_latin1_only_utf8_matches, + c); } } } @@ -15912,13 +17157,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, cp_list = posixes; } - if (depends_list) { - _invlist_union(depends_list, nonascii_but_latin1_properties, - &depends_list); + if (has_upper_latin1_only_utf8_matches) { + _invlist_union(has_upper_latin1_only_utf8_matches, + nonascii_but_latin1_properties, + &has_upper_latin1_only_utf8_matches); SvREFCNT_dec_NN(nonascii_but_latin1_properties); } else { - depends_list = nonascii_but_latin1_properties; + has_upper_latin1_only_utf8_matches + = nonascii_but_latin1_properties; } } } @@ -15932,15 +17179,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * class that isn't a Unicode property, and which matches above Unicode, \W * or [\x{110000}] for example. * (Note that in this case, unlike the Posix one above, there is no - * , because having a Unicode property forces Unicode - * semantics */ + * , because having a Unicode property + * forces Unicode semantics */ if (properties) { if (cp_list) { /* If it matters to the final outcome, see if a non-property * component of the class matches above Unicode. If so, the * warning gets suppressed. This is true even if just a single - * such code point is specified, as though not strictly correct if + * such code point is specified, as, though not strictly correct if * another such code point is matched against, the fact that they * are using above-Unicode code points indicates they should know * the issues involved */ @@ -15981,27 +17228,115 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * fetching). We know to set the flag if we have a non-NULL list for UTF-8 * locales, or the class matches at least one 0-255 range code point */ if (LOC && FOLD) { + + /* Some things on the list might be unconditionally included because of + * other components. Remove them, and clean up the list if it goes to + * 0 elements */ + if (only_utf8_locale_list && cp_list) { + _invlist_subtract(only_utf8_locale_list, cp_list, + &only_utf8_locale_list); + + if (_invlist_len(only_utf8_locale_list) == 0) { + SvREFCNT_dec_NN(only_utf8_locale_list); + only_utf8_locale_list = NULL; + } + } if (only_utf8_locale_list) { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + ANYOF_FLAGS(ret) + |= ANYOFL_FOLD + |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } else if (cp_list) { /* Look to see if a 0-255 code point is in list */ UV start, end; invlist_iterinit(cp_list); if (invlist_iternext(cp_list, &start, &end) && start < 256) { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + ANYOF_FLAGS(ret) |= ANYOFL_FOLD; } invlist_iterfinish(cp_list); } } +#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \ + ( DEPENDS_SEMANTICS \ + && (ANYOF_FLAGS(ret) \ + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + + /* See if we can simplify things under /d */ + if ( has_upper_latin1_only_utf8_matches + || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) + { + /* But not if we are inverting, as that screws it up */ + if (! invert) { + if (has_upper_latin1_only_utf8_matches) { + if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { + + /* Here, we have both the flag and inversion list. Any + * character in 'has_upper_latin1_only_utf8_matches' + * matches when UTF-8 is in effect, but it also matches + * when UTF-8 is not in effect because of + * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches + * unconditionally, so can be added to the regular list, + * and 'has_upper_latin1_only_utf8_matches' cleared */ + _invlist_union(cp_list, + has_upper_latin1_only_utf8_matches, + &cp_list); + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + has_upper_latin1_only_utf8_matches = NULL; + } + else if (cp_list) { + + /* Here, 'cp_list' gives chars that always match, and + * 'has_upper_latin1_only_utf8_matches' gives chars that + * were specified to match only if the target string is in + * UTF-8. It may be that these overlap, so we can subtract + * the unconditionally matching from the conditional ones, + * to make the conditional list as small as possible, + * perhaps even clearing it, in which case more + * optimizations are possible later */ + _invlist_subtract(has_upper_latin1_only_utf8_matches, + cp_list, + &has_upper_latin1_only_utf8_matches); + if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) { + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + has_upper_latin1_only_utf8_matches = NULL; + } + } + } + + /* Similarly, if the unconditional matches include every upper + * latin1 character, we can clear that flag to permit later + * optimizations */ + if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { + SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1); + _invlist_subtract(only_non_utf8_list, cp_list, + &only_non_utf8_list); + if (_invlist_len(only_non_utf8_list) == 0) { + ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + } + SvREFCNT_dec_NN(only_non_utf8_list); + only_non_utf8_list = NULL;; + } + } + + /* If we haven't gotten rid of all conditional matching, we change the + * regnode type to indicate that */ + if ( has_upper_latin1_only_utf8_matches + || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) + { + OP(ret) = ANYOFD; + optimizable = FALSE; + } + } +#undef MATCHES_ALL_NON_UTF8_NON_ASCII + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ if (cp_list && invert + && OP(ret) != ANYOFD && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) - && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { _invlist_invert(cp_list); @@ -16043,16 +17378,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * adjacent such nodes. And if the class is equivalent to things like /./, * expensive run-time swashes can be avoided. Now that we have more * complete information, we can find things necessarily missed by the - * earlier code. I (khw) did some benchmarks and found essentially no - * speed difference between using a POSIXA node versus an ANYOF node, so - * there is no reason to optimize, for example [A-Za-z0-9_] into - * [[:word:]]/a (although if we did it in the sizing pass it would save - * space). _invlistEQ() could be used if one ever wanted to do something - * like this at this point in the code */ - - if (optimizable && cp_list && ! invert && ! depends_list) { + * earlier code. Another possible "optimization" that isn't done is that + * something like [Ee] could be changed into an EXACTFU. khw tried this + * and found that the ANYOF is faster, including for code points not in the + * bitmap. This still might make sense to do, provided it got joined with + * an adjacent node(s) to create a longer EXACTFU one. This could be + * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join + * routine would know is joinable. If that didn't happen, the node type + * could then be made a straight ANYOF */ + + if (optimizable && cp_list && ! invert) { UV start, end; U8 op = END; /* The optimzation node-type */ + int posix_class = -1; /* Illegal value */ const char * cur_parse= RExC_parse; invlist_iterinit(cp_list); @@ -16135,6 +17473,49 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } invlist_iterfinish(cp_list); + if (op == END) { + const UV cp_list_len = _invlist_len(cp_list); + const UV* cp_list_array = invlist_array(cp_list); + + /* Here, didn't find an optimization. See if this matches any of + * the POSIX classes. These run slightly faster for above-Unicode + * code points, so don't bother with POSIXA ones nor the 2 that + * have no above-Unicode matches. We can avoid these checks unless + * the ANYOF matches at least as high as the lowest POSIX one + * (which was manually found to be \v. The actual code point may + * increase in later Unicode releases, if a higher code point is + * assigned to be \v, but this code will never break. It would + * just mean we could execute the checks for posix optimizations + * unnecessarily) */ + + if (cp_list_array[cp_list_len-1] > 0x2029) { + for (posix_class = 0; + posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; + posix_class++) + { + int try_inverted; + if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) { + continue; + } + for (try_inverted = 0; try_inverted < 2; try_inverted++) { + + /* Check if matches normal or inverted */ + if (_invlistEQ(cp_list, + PL_XPosix_ptrs[posix_class], + try_inverted)) + { + op = (try_inverted) + ? NPOSIXU + : POSIXU; + *flagp |= HASWIDTH|SIMPLE; + goto found_posix; + } + } + } + found_posix: ; + } + } + if (op != END) { RExC_parse = (char *)orig_parse; RExC_emit = (regnode *)orig_emit; @@ -16152,6 +17533,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, TRUE /* downgradable to EXACT */ ); } + else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + FLAGS(ret) = posix_class; + } SvREFCNT_dec_NN(cp_list); return ret; @@ -16172,16 +17556,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Here, the bitmap has been populated with all the Latin1 code points that * always match. Can now add to the overall list those that match only - * when the target string is UTF-8 (). */ - if (depends_list) { + * when the target string is UTF-8 (). + * */ + if (has_upper_latin1_only_utf8_matches) { if (cp_list) { - _invlist_union(cp_list, depends_list, &cp_list); - SvREFCNT_dec_NN(depends_list); + _invlist_union(cp_list, + has_upper_latin1_only_utf8_matches, + &cp_list); + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); } else { - cp_list = depends_list; + cp_list = has_upper_latin1_only_utf8_matches; } - ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES; + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } /* If there is a swash and more than one element, we can't use the swash in @@ -16249,18 +17636,13 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { assert(! (ANYOF_FLAGS(node) - & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES - |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES))); + & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)); ARG_SET(node, ANYOF_ONLY_HAS_BITMAP); } else { AV * const av = newAV(); SV *rv; - assert(ANYOF_FLAGS(node) - & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES - |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD)); - av_store(av, 0, (runtime_defns) ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { @@ -16297,7 +17679,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, - SV* exclude_list) + SV** output_invlist) { /* For internal core use only. @@ -16309,24 +17691,32 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, * swash exists, by calling this function with 'doinit' set to false, in * which case the components that will be used to eventually create the * swash are returned (in a printable form). - * If is not NULL, it is an inversion list of things to - * exclude from what's returned in . + * If is not NULL, it is where this routine is to + * store an inversion list of code points that should match only if the + * execution-time locale is a UTF-8 one. + * If is not NULL, it is where this routine is to store an + * inversion list of the code points that would be instead returned in + * if this were NULL. Thus, what gets output in + * when this parameter is used, is just the non-code point data that + * will go into creating the swash. This currently should be just + * user-defined properties whose definitions were not known at compile + * time. Using this parameter allows for easier manipulation of the + * swash's data by the caller. It is illegal to call this function with + * this parameter set, but not + * * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note * that, in spite of this function's name, the swash it returns may include * the bitmap data as well */ SV *sw = NULL; SV *si = NULL; /* Input swash initialization string */ - SV* invlist = NULL; + SV* invlist = NULL; RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - - assert(ANYOF_FLAGS(node) - & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES - |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD)); + assert(! output_invlist || listsvp); if (data && data->count) { const U32 n = ARG(node); @@ -16339,9 +17729,6 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 3 and 4 are either both present or both absent. [3] is - * any inversion list generated at compile time; [4] indicates if - * that inversion list has any user-defined properties in it. */ if (av_tindex(av) >= 2) { if (only_utf8_locale_ptr && ary[2] @@ -16354,6 +17741,10 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, *only_utf8_locale_ptr = NULL; } + /* Elements 3 and 4 are either both present or both absent. [3] + * is any inversion list generated at compile time; [4] + * indicates if that inversion list has any user-defined + * properties in it. */ if (av_tindex(av) >= 3) { invlist = ary[3]; if (SvUV(ary[4])) { @@ -16387,7 +17778,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, /* If requested, return a printable version of what this swash matches */ if (listsvp) { - SV* matches_string = newSVpvs(""); + SV* matches_string = NULL; /* The swash should be used, if possible, to get the data, as it * contains the resolved data. But this function can be called at @@ -16397,22 +17788,124 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - sv_catsv(matches_string, si); + /* Here, we only have 'si' (and possibly some passed-in data in + * 'invlist', which is handled below) If the caller only wants + * 'si', use that. */ + if (! output_invlist) { + matches_string = newSVsv(si); + } + else { + /* But if the caller wants an inversion list of the node, we + * need to parse 'si' and place as much as possible in the + * desired output inversion list, making 'matches_string' only + * contain the currently unresolvable things */ + const char *si_string = SvPVX(si); + STRLEN remaining = SvCUR(si); + UV prev_cp = 0; + U8 count = 0; + + /* Ignore everything before the first new-line */ + while (*si_string != '\n' && remaining > 0) { + si_string++; + remaining--; + } + assert(remaining > 0); + + si_string++; + remaining--; + + while (remaining > 0) { + + /* The data consists of just strings defining user-defined + * property names, but in prior incarnations, and perhaps + * somehow from pluggable regex engines, it could still + * hold hex code point definitions. Each component of a + * range would be separated by a tab, and each range by a + * new-line. If these are found, instead add them to the + * inversion list */ + I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT + |PERL_SCAN_SILENT_NON_PORTABLE; + STRLEN len = remaining; + UV cp = grok_hex(si_string, &len, &grok_flags, NULL); + + /* If the hex decode routine found something, it should go + * up to the next \n */ + if ( *(si_string + len) == '\n') { + if (count) { /* 2nd code point on line */ + *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); + } + else { + *output_invlist = add_cp_to_invlist(*output_invlist, cp); + } + count = 0; + goto prepare_for_next_iteration; + } + + /* If the hex decode was instead for the lower range limit, + * save it, and go parse the upper range limit */ + if (*(si_string + len) == '\t') { + assert(count == 0); + + prev_cp = cp; + count = 1; + prepare_for_next_iteration: + si_string += len + 1; + remaining -= len + 1; + continue; + } + + /* Here, didn't find a legal hex number. Just add it from + * here to the next \n */ + + remaining -= len; + while (*(si_string + len) != '\n' && remaining > 0) { + remaining--; + len++; + } + if (*(si_string + len) == '\n') { + len++; + remaining--; + } + if (matches_string) { + sv_catpvn(matches_string, si_string, len - 1); + } + else { + matches_string = newSVpvn(si_string, len - 1); + } + si_string += len; + sv_catpvs(matches_string, " "); + } /* end of loop through the text */ + + assert(matches_string); + if (SvCUR(matches_string)) { /* Get rid of trailing blank */ + SvCUR_set(matches_string, SvCUR(matches_string) - 1); + } + } /* end of has an 'si' but no swash */ } - /* Add the inversion list to whatever we have. This may have come from - * the swash, or from an input parameter */ - if (invlist) { - if (exclude_list) { - SV* clone = invlist_clone(invlist); - _invlist_subtract(clone, exclude_list, &clone); - sv_catsv(matches_string, _invlist_contents(clone)); - SvREFCNT_dec_NN(clone); + /* If we have a swash in place, its equivalent inversion list was above + * placed into 'invlist'. If not, this variable may contain a stored + * inversion list which is information beyond what is in 'si' */ + if (invlist) { + + /* Again, if the caller doesn't want the output inversion list, put + * everything in 'matches-string' */ + if (! output_invlist) { + if ( ! matches_string) { + matches_string = newSVpvs("\n"); + } + sv_catsv(matches_string, invlist_contents(invlist, + TRUE /* traditional style */ + )); + } + else if (! *output_invlist) { + *output_invlist = invlist_clone(invlist); } else { - sv_catsv(matches_string, _invlist_contents(invlist)); + _invlist_union(*output_invlist, invlist, output_invlist); } - } + } + *listsvp = matches_string; } @@ -16524,14 +18017,16 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) { PERL_ARGS_ASSERT_NEXTCHAR; - assert( ! UTF - || UTF8_IS_INVARIANT(*RExC_parse) - || UTF8_IS_START(*RExC_parse)); + if (RExC_parse < RExC_end) { + assert( ! UTF + || UTF8_IS_INVARIANT(*RExC_parse) + || UTF8_IS_START(*RExC_parse)); - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; - skip_to_be_ignored_text(pRExC_state, &RExC_parse, - FALSE /* Don't assume /x */ ); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't assume /x */ ); + } } STATIC regnode * @@ -16730,10 +18225,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) - regtail - set the next-pointer at the end of a node chain of p to val. - SEE ALSO: regtail_study */ -/* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, - const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t * pRExC_state, + const regnode * const p, + const regnode * const val, + const U32 depth) { regnode *scan; GET_RE_DEBUG_FLAGS_DECL; @@ -16747,7 +18243,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, return; /* Find last node. */ - scan = p; + scan = (regnode *) p; for (;;) { regnode * const temp = regnext(scan); DEBUG_PARSE_r({ @@ -17043,6 +18539,53 @@ Perl_regdump(pTHX_ const regexp *r) #endif /* DEBUGGING */ } +/* Should be synchronized with ANYOF_ #defines in regcomp.h */ +#ifdef DEBUGGING + +# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \ + || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \ + || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \ + || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \ + || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \ + || _CC_VERTSPACE != 15 +# error Need to adjust order of anyofs[] +# endif +static const char * const anyofs[] = { + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" +}; +#endif + /* - regprop - printable representation of opcode, with run time support */ @@ -17052,49 +18595,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ { #ifdef DEBUGGING int k; - - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { -#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ - || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ - || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ - || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ - || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15 - #error Need to adjust order of anyofs[] -#endif - "\\w", - "\\W", - "\\d", - "\\D", - "[:alpha:]", - "[:^alpha:]", - "[:lower:]", - "[:^lower:]", - "[:upper:]", - "[:^upper:]", - "[:punct:]", - "[:^punct:]", - "[:print:]", - "[:^print:]", - "[:alnum:]", - "[:^alnum:]", - "[:graph:]", - "[:^graph:]", - "[:cased:]", - "[:^cased:]", - "\\s", - "\\S", - "[:blank:]", - "[:^blank:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:ascii:]", - "[:^ascii:]", - "\\v", - "\\V" - }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -17151,10 +18651,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); (void) put_charclass_bitmap_innards(sv, - (IS_ANYOF_TRIE(op)) + ((IS_ANYOF_TRIE(op)) ? ANYOF_BITMAP(o) - : TRIE_BITMAP(trie), - NULL); + : TRIE_BITMAP(trie)), + NULL, + NULL, + NULL + ); sv_catpvs(sv, "]"); } @@ -17237,149 +18740,133 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { const U8 flags = ANYOF_FLAGS(o); - int do_sep = 0; - SV* bitmap_invlist; /* Will hold what the bit map contains */ + bool do_sep = FALSE; /* Do we need to separate various components of + the output? */ + /* Set if there is still an unresolved user-defined property */ + SV *unresolved = NULL; + + /* Things that are ignored except when the runtime locale is UTF-8 */ + SV *only_utf8_locale_invlist = NULL; + + /* Code points that don't fit in the bitmap */ + SV *nonbitmap_invlist = NULL; + /* And things that aren't in the bitmap, but are small enough to be */ + SV* bitmap_range_not_in_bitmap = NULL; if (OP(o) == ANYOFL) { - if (flags & ANYOF_LOC_REQ_UTF8) { - sv_catpvs(sv, "{utf8-loc}"); + if (ANYOFL_UTF8_LOCALE_REQD(flags)) { + sv_catpvs(sv, "{utf8-locale-reqd}"); } - else { - sv_catpvs(sv, "{loc}"); + if (flags & ANYOFL_FOLD) { + sv_catpvs(sv, "{i}"); } } - if (flags & ANYOF_LOC_FOLD) - sv_catpvs(sv, "{i}"); - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (flags & ANYOF_INVERT) - sv_catpvs(sv, "^"); - /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches - * */ - do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o), - &bitmap_invlist); + /* If there is stuff outside the bitmap, get it */ + if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &unresolved, + &only_utf8_locale_invlist, + &nonbitmap_invlist); + /* The non-bitmap data may contain stuff that could fit in the + * bitmap. This could come from a user-defined property being + * finally resolved when this call was done; or much more likely + * because there are matches that require UTF-8 to be valid, and so + * aren't in the bitmap. This is teased apart later */ + _invlist_intersection(nonbitmap_invlist, + PL_InBitmap, + &bitmap_range_not_in_bitmap); + /* Leave just the things that don't fit into the bitmap */ + _invlist_subtract(nonbitmap_invlist, + PL_InBitmap, + &nonbitmap_invlist); + } - /* output any special charclass tests (used entirely under use - * locale) * */ - if (ANYOF_POSIXL_TEST_ANY_SET(o)) { - int i; - for (i = 0; i < ANYOF_POSIXL_MAX; i++) { - if (ANYOF_POSIXL_TEST(o,i)) { - sv_catpv(sv, anyofs[i]); - do_sep = 1; - } - } + /* Obey this flag to add all above-the-bitmap code points */ + if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + NUM_ANYOF_CODE_POINTS, + UV_MAX); } - if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP - |ANYOF_HAS_UTF8_NONBITMAP_MATCHES - |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES - |ANYOF_LOC_FOLD))) - { + /* Ready to start outputting. First, the initial left bracket */ + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + + /* Then all the things that could fit in the bitmap */ + do_sep = put_charclass_bitmap_innards(sv, + ANYOF_BITMAP(o), + bitmap_range_not_in_bitmap, + only_utf8_locale_invlist, + o); + SvREFCNT_dec(bitmap_range_not_in_bitmap); + + /* If there are user-defined properties which haven't been defined yet, + * output them, in a separate [] from the bitmap range stuff */ + if (unresolved) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); - if (flags & ANYOF_INVERT) - /*make sure the invert info is in each */ - sv_catpvs(sv, "^"); } - - if (OP(o) == ANYOFD - && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) - { - sv_catpvs(sv, "{non-utf8-latin1-all}"); + if (flags & ANYOF_INVERT) { + sv_catpvs(sv, "^"); } + sv_catsv(sv, unresolved); + do_sep = TRUE; + SvREFCNT_dec_NN(unresolved); + } - if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) - sv_catpvs(sv, "{above_bitmap_all}"); - - if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { - SV *lv; /* Set if there is something outside the bit map. */ - bool byte_output = FALSE; /* If something has been output */ - SV *only_utf8_locale; - - /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist' - * is used to guarantee that nothing in the bitmap gets - * returned */ - (void) _get_regclass_nonbitmap_data(prog, o, FALSE, - &lv, &only_utf8_locale, - bitmap_invlist); - if (lv && lv != &PL_sv_undef) { - char *s = savesvpv(lv); - char * const origs = s; - - while (*s && *s != '\n') - s++; - - if (*s == '\n') { - const char * const t = ++s; + /* And, finally, add the above-the-bitmap stuff */ + if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { + SV* contents; - if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) { - sv_catpvs(sv, "{outside bitmap}"); - } - else { - sv_catpvs(sv, "{utf8}"); - } + /* See if truncation size is overridden */ + const STRLEN dump_len = (PL_dump_re_max_len) + ? PL_dump_re_max_len + : 256; - if (byte_output) { - sv_catpvs(sv, " "); - } - - while (*s) { - if (*s == '\n') { + /* This is output in a separate [] */ + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + } - /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } - if (s[-1] == ' ') - s[-1] = 0; + /* And, for easy of understanding, it is always output not-shown as + * complemented */ + if (flags & ANYOF_INVERT) { + _invlist_invert(nonbitmap_invlist); + _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); + } - sv_catpv(sv, t); - } + contents = invlist_contents(nonbitmap_invlist, + FALSE /* output suitable for catsv */ + ); - out_dump: + /* If the output is shorter than the permissible maximum, just do it. */ + if (SvCUR(contents) <= dump_len) { + sv_catsv(sv, contents); + } + else { + const char * contents_string = SvPVX(contents); + STRLEN i = dump_len; - Safefree(origs); - SvREFCNT_dec_NN(lv); + /* Otherwise, start at the permissible max and work back to the + * first break possibility */ + while (i > 0 && contents_string[i] != ' ') { + i--; } - - if ((flags & ANYOF_LOC_FOLD) - && only_utf8_locale - && only_utf8_locale != &PL_sv_undef) - { - UV start, end; - int max_entries = 256; - - sv_catpvs(sv, "{utf8 locale}"); - invlist_iterinit(only_utf8_locale); - while (invlist_iternext(only_utf8_locale, - &start, &end)) { - put_range(sv, start, end, FALSE); - max_entries --; - if (max_entries < 0) { - sv_catpvs(sv, "..."); - break; - } - } - invlist_iterfinish(only_utf8_locale); + if (i == 0) { /* Fail-safe. Use the max if we couldn't + find a legal break */ + i = dump_len; } + + sv_catpvn(sv, contents_string, i); + sv_catpvs(sv, "..."); } - } - SvREFCNT_dec(bitmap_invlist); + SvREFCNT_dec_NN(contents); + SvREFCNT_dec_NN(nonbitmap_invlist); + } + /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == POSIXD || k == NPOSIXD) { @@ -17402,9 +18889,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ const char * const bounds[] = { "", /* Traditional */ "{gcb}", + "{lb}", "{sb}", "{wb}" }; + assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); sv_catpv(sv, bounds[FLAGS(o)]); } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -18044,18 +19533,18 @@ S_put_code_point(pTHX_ SV *sv, UV c) } else if (isPRINT(c)) { const char string = (char) c; - if (isBACKSLASHED_PUNCT(c)) + + /* We use {phrase} as metanotation in the class, so also escape literal + * braces */ + if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') sv_catpvs(sv, "\\"); sv_catpvn(sv, &string, 1); } + else if (isMNEMONIC_CNTRL(c)) { + Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); + } else { - const char * const mnemonic = cntrl_to_mnemonic((char) c); - if (mnemonic) { - Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c); - } + Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); } } @@ -18065,8 +19554,15 @@ STATIC void S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) { /* Appends to 'sv' a displayable version of the range of code points from - * 'start' to 'end'. It assumes that only ASCII printables are displayable - * as-is (though some of these will be escaped by put_code_point()). */ + * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls + * that have them, when they occur at the beginning or end of the range. + * It uses hex to output the remaining code points, unless 'allow_literals' + * is true, in which case the printable ASCII ones are output as-is (though + * some of these will be escaped by put_code_point()). + * + * NOTE: This is designed only for printing ranges of code points that fit + * inside an ANYOF bitmap. Higher code points are simply suppressed + */ const unsigned int min_range_count = 3; @@ -18080,7 +19576,7 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) if (end - start < min_range_count) { - /* Individual chars in short ranges */ + /* Output chars individually when they occur in short ranges */ for (; start <= end; start++) { put_code_point(sv, start); } @@ -18089,11 +19585,11 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) /* If permitted by the input options, and there is a possibility that * this range contains a printable literal, look to see if there is - * one. */ + * one. */ if (allow_literals && start <= MAX_PRINT_A) { - /* If the range begin isn't an ASCII printable, effectively split - * the range into two parts: + /* If the character at the beginning of the range isn't an ASCII + * printable, effectively split the range into two parts: * 1) the portion before the first such printable, * 2) the rest * and output them separately. */ @@ -18115,18 +19611,18 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) temp_end = end + 1; } - /* Output the first part of the split range, the part that - * doesn't have printables, with no looking for literals - * (otherwise we would infinitely recurse) */ + /* Output the first part of the split range: the part that + * doesn't have printables, with the parameter set to not look + * for literals (otherwise we would infinitely recurse) */ put_range(sv, start, temp_end - 1, FALSE); /* The 2nd part of the range (if any) starts here. */ start = temp_end; - /* We continue instead of dropping down because even if the 2nd - * part is non-empty, it could be so short that we want to - * output it specially, as tested for at the top of this loop. - * */ + /* We do a continue, instead of dropping down, because even if + * the 2nd part is non-empty, it could be so short that we want + * to output it as individual characters, as tested for at the + * top of this loop. */ continue; } @@ -18191,7 +19687,8 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) temp_end--; } - /* And separately output the range that doesn't have mnemonics */ + /* And separately output the interior range that doesn't start or + * end with mnemonics */ put_range(sv, start, temp_end, FALSE); /* Then output the mnemonic trailing controls */ @@ -18210,10 +19707,10 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) : NUM_ANYOF_CODE_POINTS - 1; #if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) - ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}" + ? "\\x%02"UVXf"-\\x%02"UVXf"" : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; #else - format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}"; + format = "\\x%02"UVXf"-\\x%02"UVXf""; #endif GCC_DIAG_IGNORE(-Wformat-nonliteral); Perl_sv_catpvf(aTHX_ sv, format, start, this_end); @@ -18222,57 +19719,26 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) } } -STATIC bool -S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) +STATIC void +S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) { - /* Appends to 'sv' a displayable version of the innards of the bracketed - * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually - * output anything, and bitmap_invlist, if not NULL, will point to an - * inversion list of what is in the bit map */ + /* Concatenate onto the PV in 'sv' a displayable form of the inversion list + * 'invlist' */ - int i; UV start, end; - unsigned int punct_count = 0; - SV* invlist = NULL; - SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */ bool allow_literals = TRUE; - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; - - invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist; - - /* Worst case is exactly every-other code point is in the list */ - *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); - - /* Convert the bit map to an inversion list, keeping track of how many - * ASCII puncts are set, including an extra amount for the backslashed - * ones. */ - for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (BITMAP_TEST(bitmap, i)) { - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i); - if (isPUNCT_A(i)) { - punct_count++; - if isBACKSLASHED_PUNCT(i) { - punct_count++; - } - } - } - } - - /* Nothing to output */ - if (_invlist_len(*invlist_ptr) == 0) { - SvREFCNT_dec(invlist); - return FALSE; - } + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; /* Generally, it is more readable if printable characters are output as * literals, but if a range (nearly) spans all of them, it's best to output * it as a single range. This code will use a single range if all but 2 - * printables are in it */ - invlist_iterinit(*invlist_ptr); - while (invlist_iternext(*invlist_ptr, &start, &end)) { + * ASCII printables are in it */ + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { - /* If range starts beyond final printable, it doesn't have any in it */ + /* If the range starts beyond the final printable, it doesn't have any + * in it */ if (start > MAX_PRINT_A) { break; } @@ -18293,36 +19759,347 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) break; } } - invlist_iterfinish(*invlist_ptr); - - /* The legibility of the output depends mostly on how many punctuation - * characters are output. There are 32 possible ASCII ones, and some have - * an additional backslash, bringing it to currently 36, so if any more - * than 18 are to be output, we can instead output it as its complement, - * yielding fewer puncts, and making it more legible. But give some weight - * to the fact that outputting it as a complement is less legible than a - * straight output, so don't complement unless we are somewhat over the 18 - * mark */ - if (allow_literals && punct_count > 22) { - sv_catpvs(sv, "^"); - - /* Add everything remaining to the list, so when we invert it just - * below, it will be excluded */ - _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr); - _invlist_invert(*invlist_ptr); - } + invlist_iterfinish(invlist); /* Here we have figured things out. Output each range */ - invlist_iterinit(*invlist_ptr); - while (invlist_iternext(*invlist_ptr, &start, &end)) { + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { if (start >= NUM_ANYOF_CODE_POINTS) { break; } put_range(sv, start, end, allow_literals); } - invlist_iterfinish(*invlist_ptr); + invlist_iterfinish(invlist); - return TRUE; + return; +} + +STATIC SV* +S_put_charclass_bitmap_innards_common(pTHX_ + SV* invlist, /* The bitmap */ + SV* posixes, /* Under /l, things like [:word:], \S */ + SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ + SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ + SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ + const bool invert /* Is the result to be inverted? */ +) +{ + /* Create and return an SV containing a displayable version of the bitmap + * and associated information determined by the input parameters. */ + + SV * output; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; + + if (invert) { + output = newSVpvs("^"); + } + else { + output = newSVpvs(""); + } + + /* First, the code points in the bitmap that are unconditionally there */ + put_charclass_bitmap_innards_invlist(output, invlist); + + /* Traditionally, these have been placed after the main code points */ + if (posixes) { + sv_catsv(output, posixes); + } + + if (only_utf8 && _invlist_len(only_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8); + } + + if (not_utf8 && _invlist_len(not_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, not_utf8); + } + + if (only_utf8_locale && _invlist_len(only_utf8_locale)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8_locale); + + /* This is the only list in this routine that can legally contain code + * points outside the bitmap range. The call just above to + * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so + * output them here. There's about a half-dozen possible, and none in + * contiguous ranges longer than 2 */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + UV start, end; + SV* above_bitmap = NULL; + + _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); + + invlist_iterinit(above_bitmap); + while (invlist_iternext(above_bitmap, &start, &end)) { + UV i; + + for (i = start; i <= end; i++) { + put_code_point(output, i); + } + } + invlist_iterfinish(above_bitmap); + SvREFCNT_dec_NN(above_bitmap); + } + } + + /* If the only thing we output is the '^', clear it */ + if (invert && SvCUR(output) == 1) { + SvCUR_set(output, 0); + } + + return output; +} + +STATIC bool +S_put_charclass_bitmap_innards(pTHX_ SV *sv, + char *bitmap, + SV *nonbitmap_invlist, + SV *only_utf8_locale_invlist, + const regnode * const node) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class defined by the other arguments: + * 'bitmap' points to the bitmap. + * 'nonbitmap_invlist' is an inversion list of the code points that are in + * the bitmap range, but for some reason aren't in the bitmap; NULL if + * none. The reasons for this could be that they require some + * condition such as the target string being or not being in UTF-8 + * (under /d), or because they came from a user-defined property that + * was not resolved at the time of the regex compilation (under /u) + * 'only_utf8_locale_invlist' is an inversion list of the code points that + * are valid only if the runtime locale is a UTF-8 one; NULL if none + * 'node' is the regex pattern node. It is needed only when the above two + * parameters are not null, and is passed so that this routine can + * tease apart the various reasons for them. + * + * It returns TRUE if there was actually something output. (It may be that + * the bitmap, etc is empty.) + * + * When called for outputting the bitmap of a non-ANYOF node, just pass the + * bitmap, with the succeeding parameters set to NULL. + * + */ + + /* In general, it tries to display the 'cleanest' representation of the + * innards, choosing whether to display them inverted or not, regardless of + * whether the class itself is to be inverted. However, there are some + * cases where it can't try inverting, as what actually matches isn't known + * until runtime, and hence the inversion isn't either. */ + bool inverting_allowed = TRUE; + + int i; + STRLEN orig_sv_cur = SvCUR(sv); + + SV* invlist; /* Inversion list we accumulate of code points that + are unconditionally matched */ + SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is + UTF-8 */ + SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 + */ + SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ + SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale + is UTF-8 */ + + SV* as_is_display; /* The output string when we take the inputs + literally */ + SV* inverted_display; /* The output string when we invert the inputs */ + + U8 flags = (node) ? ANYOF_FLAGS(node) : 0; + + bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted + to match? */ + /* We are biased in favor of displaying things without them being inverted, + * as that is generally easier to understand */ + const int bias = 5; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; + + /* Start off with whatever code points are passed in. (We clone, so we + * don't change the caller's list) */ + if (nonbitmap_invlist) { + assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); + invlist = invlist_clone(nonbitmap_invlist); + } + else { /* Worst case size is every other code point is matched */ + invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); + } + + if (flags) { + if (OP(node) == ANYOFD) { + + /* This flag indicates that the code points below 0x100 in the + * nonbitmap list are precisely the ones that match only when the + * target is UTF-8 (they should all be non-ASCII). */ + if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) + { + _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); + _invlist_subtract(invlist, only_utf8, &invlist); + } + + /* And this flag for matching all non-ASCII 0xFF and below */ + if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) + { + if (invert) { + not_utf8 = _new_invlist(0); + } + else { + not_utf8 = invlist_clone(PL_UpperLatin1); + } + inverting_allowed = FALSE; /* XXX needs more work to be able + to allow this */ + } + } + else if (OP(node) == ANYOFL) { + + /* If either of these flags are set, what matches isn't + * determinable except during execution, so don't know enough here + * to invert */ + if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { + inverting_allowed = FALSE; + } + + /* What the posix classes match also varies at runtime, so these + * will be output symbolically. */ + if (ANYOF_POSIXL_TEST_ANY_SET(node)) { + int i; + + posixes = newSVpvs(""); + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(node,i)) { + sv_catpv(posixes, anyofs[i]); + } + } + } + } + } + + /* Accumulate the bit map into the unconditional match list */ + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (BITMAP_TEST(bitmap, i)) { + int start = i++; + for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); + } + } + + /* Make sure that the conditional match lists don't have anything in them + * that match unconditionally; otherwise the output is quite confusing. + * This could happen if the code that populates these misses some + * duplication. */ + if (only_utf8) { + _invlist_subtract(only_utf8, invlist, &only_utf8); + } + if (not_utf8) { + _invlist_subtract(not_utf8, invlist, ¬_utf8); + } + + if (only_utf8_locale_invlist) { + + /* Since this list is passed in, we have to make a copy before + * modifying it */ + only_utf8_locale = invlist_clone(only_utf8_locale_invlist); + + _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); + + /* And, it can get really weird for us to try outputting an inverted + * form of this list when it has things above the bitmap, so don't even + * try */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + inverting_allowed = FALSE; + } + } + + /* Calculate what the output would be if we take the input as-is */ + as_is_display = put_charclass_bitmap_innards_common(invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, + invert); + + /* If have to take the output as-is, just do that */ + if (! inverting_allowed) { + sv_catsv(sv, as_is_display); + } + else { /* But otherwise, create the output again on the inverted input, and + use whichever version is shorter */ + + int inverted_bias, as_is_bias; + + /* We will apply our bias to whichever of the the results doesn't have + * the '^' */ + if (invert) { + invert = FALSE; + as_is_bias = bias; + inverted_bias = 0; + } + else { + invert = TRUE; + as_is_bias = 0; + inverted_bias = bias; + } + + /* Now invert each of the lists that contribute to the output, + * excluding from the result things outside the possible range */ + + /* For the unconditional inversion list, we have to add in all the + * conditional code points, so that when inverted, they will be gone + * from it */ + _invlist_union(only_utf8, invlist, &invlist); + _invlist_union(only_utf8_locale, invlist, &invlist); + _invlist_invert(invlist); + _invlist_intersection(invlist, PL_InBitmap, &invlist); + + if (only_utf8) { + _invlist_invert(only_utf8); + _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); + } + + if (not_utf8) { + _invlist_invert(not_utf8); + _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8); + } + + if (only_utf8_locale) { + _invlist_invert(only_utf8_locale); + _invlist_intersection(only_utf8_locale, + PL_InBitmap, + &only_utf8_locale); + } + + inverted_display = put_charclass_bitmap_innards_common( + invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, invert); + + /* Use the shortest representation, taking into account our bias + * against showing it inverted */ + if (SvCUR(inverted_display) + inverted_bias + < SvCUR(as_is_display) + as_is_bias) + { + sv_catsv(sv, inverted_display); + } + else { + sv_catsv(sv, as_is_display); + } + + SvREFCNT_dec_NN(as_is_display); + SvREFCNT_dec_NN(inverted_display); + } + + SvREFCNT_dec_NN(invlist); + SvREFCNT_dec(only_utf8); + SvREFCNT_dec(not_utf8); + SvREFCNT_dec(posixes); + SvREFCNT_dec(only_utf8_locale); + + return SvCUR(sv) > orig_sv_cur; } #define CLEAR_OPTSTART \