struct regnode_charclass_class *start_class;
} scan_data_t;
+/* The below is perhaps overboard, but this allows us to save a test at the
+ * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
+ * and 'a' differ by a single bit; the same with the upper and lower case of
+ * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
+ * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit position
+ * where the upper- and lowercase differ. XXX There are about 40 other
+ * instances in the Perl core where this micro-optimization could be used.
+ * Should decide if maintenance cost is worse, before changing those
+ *
+ * Returns a boolean as to whether or not 'v' is either a lowercase or
+ * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
+ * compile-time constant, the generated code is better than some optimizing
+ * compilers figure out, amounting to a mask and test. The results are
+ * meaningless if 'c' is not one of [A-Za-z] */
+#define isARG2_lower_or_UPPER_ARG1(c, v) \
+ (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
+
/*
* Forward declarations for pregcomp()'s friends.
*/
* have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
- /* The below is perhaps overboard, but this allows us to save a
- * test each time through the loop at the expense of a mask. This
- * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
- * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
- * are 64. This uses an exclusive 'or' to find that bit and then
- * inverts it to form a mask, with just a single 0, in the bit
- * position where 'S' and 's' differ. */
- const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
- const U8 s_masked = 's' & S_or_s_mask;
-
while (s < upper) {
int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
}
if (len == 2
- && ((*s & S_or_s_mask) == s_masked)
- && ((*(s+1) & S_or_s_mask) == s_masked))
+ && isARG2_lower_or_UPPER_ARG1('s', *s)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
{
/* EXACTF nodes need to know that the minimum length
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ if ( n == RX_BUFF_IDX_CARET_PREMATCH
|| n == RX_BUFF_IDX_CARET_FULLMATCH
|| n == RX_BUFF_IDX_CARET_POSTMATCH
- )
- && !(rx->extflags & RXf_PMf_KEEPCOPY)
- )
- goto ret_undef;
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto ret_undef;
+ }
if (!rx->subbeg)
goto ret_undef;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
+ if ( paren == RX_BUFF_IDX_CARET_PREMATCH
+ || paren == RX_BUFF_IDX_CARET_FULLMATCH
+ || paren == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ {
+ bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+ if (!keepcopy) {
+ /* on something like
+ * $r = qr/.../;
+ * /$qr/p;
+ * the KEEPCOPY is set on the PMOP rather than the regex */
+ if (PL_curpm && r == PM_GETRE(PL_curpm))
+ keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+ }
+ if (!keepcopy)
+ goto warn_undef;
+ }
+
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
switch (paren) {
case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
return 0;
case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
}
return 0;
- case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
- if (!(rx->extflags & RXf_PMf_KEEPCOPY))
- goto warn_undef;
- /*FALLTHROUGH*/
-
- /* $& / ${^MATCH}, $1, $2, ... */
- default:
+ default: /* $& / ${^MATCH}, $1, $2, ... */
if (paren <= (I32)rx->nparens &&
(s1 = rx->offs[paren].start) != -1 &&
(t1 = rx->offs[paren].end) != -1)
}
#endif
-#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
+#ifndef PERL_IN_XSUB_RE
void
-Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
+Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
{
- /* Dumps out the ranges in an inversion list. The string 'header'
- * if present is output on a line before the first range */
+ /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
+ * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
+ * the string 'indent'. The output looks like this:
+ [0] 0x000A .. 0x000D
+ [2] 0x0085
+ [4] 0x2028 .. 0x2029
+ [6] 0x3104 .. INFINITY
+ * This means that the first range of code points matched by the list are
+ * 0xA through 0xD; the second range contains only the single code point
+ * 0x85, etc. An inversion list is an array of UVs. Two array elements
+ * are used to define each range (except if the final range extends to
+ * infinity, only a single element is needed). The array index of the
+ * first element for the corresponding range is given in brackets. */
UV start, end;
+ STRLEN count = 0;
PERL_ARGS_ASSERT__INVLIST_DUMP;
- if (header && strlen(header)) {
- PerlIO_printf(Perl_debug_log, "%s\n", header);
- }
if (invlist_is_iterating(invlist)) {
- PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
+ Perl_dump_indent(aTHX_ level, file,
+ "%sCan't dump inversion list because is in middle of iterating\n",
+ indent);
return;
}
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+ indent, (UV)count, start);
}
else if (end != start) {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
- start, end);
+ Perl_dump_indent(aTHX_ level, file,
+ "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+ indent, (UV)count, start, end);
}
else {
- PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
+ Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+ indent, (UV)count, start);
}
+ count += 2;
}
}
#endif
defchar: {
STRLEN len = 0;
- UV ender;
+ UV ender = 0;
char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
- U8 node_type;
+ U8 node_type = compute_EXACTish(pRExC_state);
bool next_is_quantifier;
char * oldp = NULL;
+ /* We can convert EXACTF nodes to EXACTFU if they contain only
+ * characters that match identically regardless of the target
+ * string's UTF8ness. The reason to do this is that EXACTF is not
+ * trie-able, EXACTFU is. (We don't need to figure this out until
+ * pass 2) */
+ bool maybe_exactfu = node_type == EXACTF && PASS2;
+
/* If a folding node contains only code points that don't
* participate in folds, it can be changed into an EXACT node,
* which allows the optimizer more things to look for */
bool maybe_exact;
- ender = 0;
- node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
/* In pass1, folded, we use a temporary buffer instead of the
/* We do the EXACTFish to EXACT node only if folding, and not if in
* locale, as whether a character folds or not isn't known until
- * runtime */
- maybe_exact = FOLD && ! LOC;
+ * runtime. (And we don't need to figure this out until pass 2) */
+ maybe_exact = FOLD && ! LOC && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
|| (node_type == EXACTFU
&& ender == LATIN_SMALL_LETTER_SHARP_S)))
{
+ if (IS_IN_SOME_FOLD_L1(ender)) {
+ maybe_exact = FALSE;
+
+ /* See if the character's fold differs between /d and
+ * /u. This includes the multi-char fold SHARP S to
+ * 'ss' */
+ if (maybe_exactfu
+ && (PL_fold[ender] != PL_fold_latin1[ender]
+ || ender == LATIN_SMALL_LETTER_SHARP_S
+ || (len > 0
+ && isARG2_lower_or_UPPER_ARG1('s', ender)
+ && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
+ {
+ maybe_exactfu = FALSE;
+ }
+ }
*(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
}
else { /* UTF */
* do any better */
if (len == 0) {
len = full_len;
+
+ /* If the node ends in an 's' we make sure it stays EXACTF,
+ * as if it turns into an EXACTFU, it could later get
+ * joined with another 's' that would then wrongly match
+ * the sharp s */
+ if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ {
+ maybe_exactfu = FALSE;
+ }
} else {
/* Here, the node does contain some characters that aren't
if (len == 0) {
OP(ret) = NOTHING;
}
- else{
-
- /* If 'maybe_exact' is still set here, means there are no
- * code points in the node that participate in folds */
- if (FOLD && maybe_exact) {
- OP(ret) = EXACT;
+ else {
+ if (FOLD) {
+ /* If 'maybe_exact' is still set here, means there are no
+ * code points in the node that participate in folds;
+ * similarly for 'maybe_exactfu' and code points that match
+ * differently depending on UTF8ness of the target string
+ * */
+ if (maybe_exact) {
+ OP(ret) = EXACT;
+ }
+ else if (maybe_exactfu) {
+ OP(ret) = EXACTFU;
+ }
}
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
}
sv_catpvs(sv, "{unicode_all}");
else if (ANYOF_NONBITMAP(o)) {
SV *lv; /* Set if there is something outside the bit map. */
+ bool byte_output = FALSE; /* If something in the bitmap has been
+ output */
if (flags & ANYOF_NONBITMAP_NON_UTF8) {
sv_catpvs(sv, "{outside bitmap}");
}
/* Get the stuff that wasn't in the bitmap */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
- bool byte_output = FALSE; /* If something in the bitmap has been
- output */
+ (void) regclass_swash(prog, o, FALSE, &lv, NULL);
if (lv && lv != &PL_sv_undef) {
char *s = savesvpv(lv);
char * const origs = s;