});
}
else { /* longjumped back */
- STRLEN len = plen;
+ U8 *src, *dst;
+ int n=0;
+ STRLEN s = 0, d = 0;
+ bool do_end = 0;
/* If the cause for the longjmp was other than changing to utf8, pop
* our own setjmp, and longjmp to the correct handler */
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
- if (!pat_count) {
- assert(expr && expr->op_type == OP_LIST);
- sv_setpvn(pat, "", 0);
- SvUTF8_on(pat);
- S_get_pat_and_code_indices(aTHX_ pRExC_state, expr, pat);
- exp = SvPV(pat, plen);
- xend = exp + plen;
- }
- else {
- exp = (char*)Perl_bytes_to_utf8(aTHX_
- (U8*)SvPV_nomg(pat, plen),
- &len);
- xend = exp + len;
- SAVEFREEPV(exp);
+ /* upgrade pattern to UTF8, and if there are code blocks,
+ * recalculate the indices.
+ * This is essentially an unrolled Perl_bytes_to_utf8() */
+
+ src = (U8*)SvPV_nomg(pat, plen);
+ Newx(dst, plen * 2 + 1, U8);
+
+ while (s < plen) {
+ const UV uv = NATIVE_TO_ASCII(src[s]);
+ if (UNI_IS_INVARIANT(uv))
+ dst[d] = (U8)UTF_TO_NATIVE(uv);
+ else {
+ dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
+ dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ }
+ if (n < pRExC_state->num_code_blocks) {
+ if (!do_end && pRExC_state->code_blocks[n].start == s) {
+ pRExC_state->code_blocks[n].start = d;
+ assert(dst[d] == '(');
+ do_end = 1;
+ }
+ else if (do_end && pRExC_state->code_blocks[n].end == s) {
+ pRExC_state->code_blocks[n].end = d;
+ assert(dst[d] == ')');
+ do_end = 0;
+ n++;
+ }
+ }
+ s++;
+ d++;
}
+ dst[d] = '\0';
+ plen = d;
+ exp = (char*) dst;
+ xend = exp + plen;
+ SAVEFREEPV(exp);
RExC_orig_utf8 = RExC_utf8 = 1;
}
}
-plan tests => 220; # Update this when adding/deleting tests.
+plan tests => 241; # Update this when adding/deleting tests.
run_tests() unless caller;
use re 'eval';
for my $x (qw(a b c)) {
my $bc = ($x ne 'a');
+ my $c80 = chr(0x80);
# the most basic: literal code should be in same scope
# as the parent
- ok("A$x" =~ /^A(??{$x})$/, "[$x] literal code");
+ ok("A$x" =~ /^A(??{$x})$/, "[$x] literal code");
+ ok("\x{100}$x" =~ /^\x{100}(??{$x})$/, "[$x] literal code UTF8");
# the "don't recompile if pattern unchanged" mechanism
# shouldn't apply to code blocks - recompile every time
# to pick up new instances of variables
- my $code1 = 'B(??{$x})';
- tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code");
+ my $code1 = 'B(??{$x})';
+ my $code1u = $c80 . "\x{100}" . '(??{$x})';
+ tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code AA");
+ tok($bc, "A$c80\x{100}$x" =~ /^A$code1u$/,
+ "[$x] unvarying runtime code AU");
+ tok($bc, "$c80\x{100}B$x" =~ /^$c80\x{100}$code1$/,
+ "[$x] unvarying runtime code UA");
+ tok($bc, "$c80\x{101}$c80\x{100}$x" =~ /^$c80\x{101}$code1u$/,
+ "[$x] unvarying runtime code UU");
# mixed literal and run-time code blocks
- my $code2 = 'B(??{$x})';
- tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/, "[$x] literal+runtime");
+ my $code2 = 'B(??{$x})';
+ my $code2u = $c80 . "\x{100}" . '(??{$x})';
+ tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/,
+ "[$x] literal+runtime AA");
+ tok($bc, "A$x-$c80\x{100}$x" =~ /^A(??{$x})-$code2u$/,
+ "[$x] literal+runtime AU");
+ tok($bc, "$c80\x{100}$x-B$x" =~ /^$c80\x{100}(??{$x})-$code2$/,
+ "[$x] literal+runtime UA");
+ tok($bc, "$c80\x{101}$x-$c80\x{100}$x"
+ =~ /^$c80\x{101}(??{$x})-$code2u$/,
+ "[$x] literal+runtime UU");
# literal qr code only created once, naked