X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1520a685b7fb85729eee719408471f380b80a87a..11d7de88d35184d288f54028ef5ae07962378fb1:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 3283054..659d51f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -367,6 +367,24 @@ typedef struct scan_data_t { struct regnode_charclass_class *start_class; } scan_data_t; +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + /* * Forward declarations for pregcomp()'s friends. */ @@ -2938,16 +2956,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * have to find at least two characters for a multi-fold */ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - /* The below is perhaps overboard, but this allows us to save a - * test each time through the loop at the expense of a mask. This - * is because on both EBCDIC and ASCII machines, 'S' and 's' differ - * by a single bit. On ASCII they are 32 apart; on EBCDIC, they - * are 64. This uses an exclusive 'or' to find that bit and then - * inverts it to form a mask, with just a single 0, in the bit - * position where 'S' and 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; - while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ @@ -2960,8 +2968,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } if (len == 2 - && ((*s & S_or_s_mask) == s_masked) - && ((*(s+1) & S_or_s_mask) == s_masked)) + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) { /* EXACTF nodes need to know that the minimum length @@ -8293,37 +8301,53 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP +#ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist) { - /* Dumps out the ranges in an inversion list. The string 'header' - * if present is output on a line before the first range */ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ UV start, end; + STRLEN count = 0; PERL_ARGS_ASSERT__INVLIST_DUMP; - if (header && strlen(header)) { - PerlIO_printf(Perl_debug_log, "%s\n", header); - } if (invlist_is_iterating(invlist)) { - PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n"); + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); return; } invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); } else if (end != start) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", - start, end); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); } + count += 2; } } #endif @@ -10728,7 +10752,7 @@ tryagain: defchar: { STRLEN len = 0; - UV ender; + UV ender = 0; char *p; char *s; #define MAX_NODE_STRING_SIZE 127 @@ -10736,17 +10760,22 @@ tryagain: char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; STRLEN foldlen; - U8 node_type; + U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. (We don't need to figure this out until + * pass 2) */ + bool maybe_exactfu = node_type == EXACTF && PASS2; + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ bool maybe_exact; - ender = 0; - node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -10759,8 +10788,8 @@ tryagain: /* We do the EXACTFish to EXACT node only if folding, and not if in * locale, as whether a character folds or not isn't known until - * runtime */ - maybe_exact = FOLD && ! LOC; + * runtime. (And we don't need to figure this out until pass 2) */ + maybe_exact = FOLD && ! LOC && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -11074,8 +11103,23 @@ tryagain: || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', *(s-1))))) + { + maybe_exactfu = FALSE; + } + } *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); } else { /* UTF */ @@ -11264,6 +11308,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11322,12 +11375,19 @@ tryagain: if (len == 0) { OP(ret) = NOTHING; } - else{ - - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } } alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); }