- left_utf = DO_UTF8(left);
- right_utf = DO_UTF8(right);
-
- if (left_utf && !right_utf) {
- /* Avoid triggering overloading again by using temporaries.
- Maybe there should be a variant of sv_utf8_upgrade that takes pvn
- */
- right = newSVpvn_flags(rsave, rightlen, SVs_TEMP);
- sv_utf8_upgrade(right);
- rsave = rc = SvPV_nomg_const(right, rightlen);
- right_utf = TRUE;
+ /* Create downgraded temporaries of any UTF-8 encoded operands */
+ if (DO_UTF8(left)) {
+ const U8 * save_lc = (U8 *) lc;
+
+ left_utf8 = TRUE;
+ result_needs_to_be_utf8 = TRUE;
+
+ left_non_downgraded_len = leftlen;
+ lc = (char *) bytes_from_utf8_loc((const U8 *) lc, &leftlen,
+ &left_utf8,
+ (const U8 **) &left_non_downgraded);
+ /* Calculate the number of trailing unconvertible bytes. This quantity
+ * is the original length minus the length of the converted portion. */
+ left_non_downgraded_len -= left_non_downgraded - save_lc;
+ SAVEFREEPV(lc);
+ }
+ if (DO_UTF8(right)) {
+ const U8 * save_rc = (U8 *) rc;
+
+ right_utf8 = TRUE;
+ result_needs_to_be_utf8 = TRUE;
+
+ right_non_downgraded_len = rightlen;
+ rc = (char *) bytes_from_utf8_loc((const U8 *) rc, &rightlen,
+ &right_utf8,
+ (const U8 **) &right_non_downgraded);
+ right_non_downgraded_len -= right_non_downgraded - save_rc;
+ SAVEFREEPV(rc);
+ }
+
+ /* We set 'len' to the length that the operation actually operates on. The
+ * dangling part of the longer operand doesn't actually participate in the
+ * operation. What happens is that we pretend that the shorter operand has
+ * been extended to the right by enough imaginary zeros to match the length
+ * of the longer one. But we know in advance the result of the operation
+ * on zeros without having to do it. In the case of '&', the result is
+ * zero, and the dangling portion is simply discarded. For '|' and '^', the
+ * result is the same as the other operand, so the dangling part is just
+ * appended to the final result, unchanged. We currently accept above-FF
+ * code points in the dangling portion, as that's how it has long worked,
+ * and code depends on it staying that way. But it is now fatal for
+ * above-FF to appear in the portion that does get operated on. Hence, any
+ * above-FF must come only in the longer operand, and only in its dangling
+ * portion. That means that at least one of the operands has to be
+ * entirely non-UTF-8, and that operand must end at or before the first
+ * above-FF code point in the other */
+ if (left_utf8) {
+ if (right_utf8 || rightlen > leftlen) {
+ Perl_croak(aTHX_ fatal_above_ff_msg, PL_op_desc[optype]);
+ }
+ len = rightlen;