send = s + len;
-#ifndef EBCDIC
-
/* This looks like 0x010101... */
-#define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
+# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
/* This looks like 0x808080... */
-#define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
-#define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER)
-#define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
+# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
+# define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER)
+# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
* or'ing together the lowest bits of 'x'. Hopefully the final term gets
* optimized out completely on a 32-bit system, and its mask gets optimized out
* on a 64-bit system */
-#define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
+# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
| ( PTR2nat(x) >> 1) \
| ( ( (PTR2nat(x) \
& PERL_WORD_BOUNDARY_MASK) >> 2))))
+#ifndef EBCDIC
+
/* Do the word-at-a-time iff there is at least one usable full word. That
* means that after advancing to a word boundary, there still is at least a
* full word left. The number of bytes needed to advance is 'wordsize -
return FALSE;
}
- /* Otherwise fall into final loop to find which byte it is */
+# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
+ || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ *ep = x + _variant_byte_number(* (PERL_UINTMAX_T *) x);
+ assert(*ep >= s && *ep < send);
+
+ return FALSE;
+
+# else /* If weird byte order, drop into next loop to do byte-at-a-time
+ checks. */
+
break;
+# endif
}
+
x += PERL_WORDSIZE;
+
} while (x + PERL_WORDSIZE <= send);
}
-#endif
+#endif /* End of ! EBCDIC */
/* Process per-byte */
while (x < send) {
return TRUE;
}
+#ifndef EBCDIC
+
+PERL_STATIC_INLINE unsigned int
+S__variant_byte_number(PERL_UINTMAX_T word)
+{
+
+ /* This returns the position in a word (0..7) of the first variant byte in
+ * it, that is, the first byte in memory order that has its most
+ * significant bit set. This is a helper function.
+ *
+ * 'word' is the full word as loaded from memory; it is masked below so
+ * that only the msb of each byte takes part in the calculation.
+ *
+ * Note that there are no branches, except in the USING_MSVC6 fallback,
+ * which loops over the bytes instead */
+
+ assert(word);
+
+ /* Get just the msb bits of each byte.
+ *
+ * NOTE(review): the assert above runs before this masking, so it only
+ * checks that the raw word is non-zero, not that some byte really has its
+ * msb set. Only the USING_MSVC6 code below checks for (and croaks on) a
+ * word that is zero after masking; the other paths presumably rely on the
+ * caller to pass a word known to contain a variant -- confirm against the
+ * callers */
+ word &= PERL_VARIANTS_WORD_MASK;
+
+# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
+ easiest thing is to hide that from the callers.
+ This version simply examines the masked word's
+ bytes in memory order, returning the index of the
+ first non-zero one, so it is independent of the
+ byte order */
+ {
+ unsigned int i;
+ const U8 * s = (U8 *) &word;
+ dTHX;
+
+ for (i = 0; i < sizeof(word); i++ ) {
+ if (s[i]) {
+ return i;
+ }
+ }
+
+ Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
+ __FILE__, __LINE__);
+ }
+
+# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
+
+ /* Bytes are stored like
+ * Byte8 ... Byte2 Byte1
+ * 63..56...15...8 7...0
+ *
+ * so the first byte in memory is the least significant one, and the bit
+ * wanted is the least significant set bit.
+ *
+ * Isolate the lsb;
+ * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+ *
+ * The word will look like this, with a rightmost set bit in position 's':
+ * ('x's are don't cares)
+ * s
+ * x..x100..0
+ * x..xx10..0 Right shift (rightmost 0 is shifted off)
+ * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
+ * the 1 just to their left into a 0; the remainder is
+ * untouched
+ * 0..0011..1 The xor with x..xx10..0 clears that remainder, sets
+ * bottom to all 1
+ * 0..0100..0 Add 1 to clear the word except for the bit in 's'
+ *
+ * (After the masking above, any set bit is the msb of a byte, so there is
+ * always at least one trailing zero for the right shift to discard.)
+ *
+ * Another method is to do 'word &= -word'; but it generates a compiler
+ * message on some platforms about taking the negative of an unsigned */
+
+ word >>= 1;
+ word = 1 + (word ^ (word - 1));
+
+# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ /* Bytes are stored like
+ * Byte1 Byte2 ... Byte8
+ * 63..56 55..48 ... 7...0
+ *
+ * so the first byte in memory is the most significant one, and the bit
+ * wanted is the most significant set bit.
+ *
+ * Isolate the msb; http://codeforces.com/blog/entry/10330
+ *
+ * Only the most significant set bit matters. Or'ing word with its right
+ * shift of 1 makes that bit and the next one to its right both 1. Then
+ * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
+ * msb and all to the right being 1. */
+ word |= word >> 1;
+ word |= word >> 2;
+ word |= word >> 4;
+ word |= word >> 8;
+ word |= word >> 16;
+ word |= word >> 32; /* This should get optimized out on 32-bit systems. */
+
+ /* Then subtracting the right shift by 1 clears all but the left-most of
+ * the 1 bits, which is our desired result */
+ word -= (word >> 1);
+
+# else
+# error Unexpected byte order
+# endif
+
+ /* Here 'word' has a single bit set: the msb of the first byte in which it
+ * is set. Calculate that position in the word. We can use this
+ * specialized solution: https://stackoverflow.com/a/32339674/1626653,
+ * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
+ * just get shifted off at compile time)
+ *
+ * Shifting right by 7 moves the single bit to the bottom of its byte, so
+ * the multiplication is just a left shift of the constant by a whole
+ * number of bytes; the final right shift then extracts the top byte of
+ * that product, which holds the bit position. */
+ word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
+ | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
+ | (39 << 24) | (47 << 16)
+ | (55 << 8) | (63 << 0));
+ word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
+
+ /* Here, word contains the position 7..63 of that bit (always one less
+ * than a multiple of 8). Convert to 0..7 */
+ word = ((word + 1) >> 3) - 1;
+
+# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ /* And invert the result, because on this byte order the most significant
+ * byte is the first one in memory.
+ *
+ * NOTE(review): this subtracts from CHARBITS (presumably the number of
+ * bits in a byte) rather than from the number of bytes in the word. The
+ * two are equal when PERL_WORDSIZE is 8; confirm this is intended for
+ * other word sizes before changing either this or the multiplier above */
+ word = CHARBITS - word - 1;
+
+# endif
+
+ return (unsigned int) word;
+}
+
+#endif
#if defined(PERL_CORE) || defined(PERL_EXT)
/*
/* Process per-word as long as we have at least a full word left */
do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
explanation of how this works */
- count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
+ count += (Size_t)
+ ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
* PERL_COUNT_MULTIPLIER)
>> ((PERL_WORDSIZE - 1) * CHARBITS);
x += PERL_WORDSIZE;
=cut
This is commonly used to determine if a SV's UTF-8 flag should be turned on.
-It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
-it otherwise contains invalid UTF-8.
+It generally needn't be if its string is entirely UTF-8 invariant, and it
+shouldn't be if it otherwise contains invalid UTF-8.
It is an internal function because khw thinks that XS code shouldn't be working
at this low a level. A valid use case could change that.