#define utf8_to_uvchr_buf(s, e, lenp) \
utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp, \
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
+#define utf8n_to_uvchr(s, len, lenp, flags) \
+ utf8n_to_uvchr_error(s, len, lenp, flags, 0)
#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
-#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
-#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0)
-#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0)
-#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0)
+
+#define to_utf8_fold(s, r, lenr) \
+ _to_utf8_fold_flags (s, NULL, r, lenr, FOLD_FLAGS_FULL, __FILE__, __LINE__)
+#define to_utf8_lower(s, r, lenr) \
+ _to_utf8_lower_flags(s, NULL, r ,lenr, 0, __FILE__, __LINE__)
+#define to_utf8_upper(s, r, lenr) \
+ _to_utf8_upper_flags(s, NULL, r, lenr, 0, __FILE__, __LINE__)
+#define to_utf8_title(s, r, lenr) \
+ _to_utf8_title_flags(s, NULL, r, lenr ,0, __FILE__, __LINE__)
#define foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
foldEQ_utf8_flags(s1, pe1, l1, u1, s2, pe2, l2, u2, 0)
*/
/*** GENERATED CODE ***/
#define is_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
- ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
-: ( 0xE0 == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
- ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xF0 == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
+( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \
+ ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+: ( 0xE0 == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEF ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBF ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( ( ( ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
/* The above macro handles UTF-8 that has this start byte as the maximum */
#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
*/
/*** GENERATED CODE ***/
#define is_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
- ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
-: ( 0xE0 == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || 0xEE == ((U8*)s)[0] ) ?\
- ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xED == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xEF == ((U8*)s)[0] ) ? \
- ( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xB6 ) || ( 0xB8 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBE ) ) ?\
- ( LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ? 3 : 0 ) \
- : ( 0xB7 == ((U8*)s)[1] ) ? \
- ( LIKELY( ( ((U8*)s)[2] & 0xF0 ) == 0x80 || ( ((U8*)s)[2] & 0xF0 ) == 0xB0 ) ? 3 : 0 )\
- : ( ( 0xBF == ((U8*)s)[1] ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBD ) ) ? 3 : 0 )\
-: ( 0xF0 == ((U8*)s)[0] ) ? \
- ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9E ) || ( 0xA0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xAE ) || ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBE ) ) ?\
- ( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( ((U8*)s)[1] == 0x9F || ( ( ((U8*)s)[1] & 0xEF ) == 0xAF ) ) ? \
- ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? \
- ( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
- : LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
+( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \
+ ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+: ( 0xE0 == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEC ) || 0xEE == ((const U8*)s)[0] ) ?\
+ ( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xED == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xEF == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x80 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xB6 ) || ( 0xB8 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBE ) ) ?\
+ ( LIKELY( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ? 3 : 0 ) \
+ : ( 0xB7 == ((const U8*)s)[1] ) ? \
+ ( LIKELY( ( ((const U8*)s)[2] & 0xF0 ) == 0x80 || ( ((const U8*)s)[2] & 0xF0 ) == 0xB0 ) ? 3 : 0 )\
+ : ( ( 0xBF == ((const U8*)s)[1] ) && ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBD ) ) ? 3 : 0 )\
+: ( 0xF0 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0x9E ) || ( 0xA0 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xAE ) || ( 0xB0 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBE ) ) ?\
+ ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+ : ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) ? \
+ ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \
+ ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
+ : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
: 0 ) \
-: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ? \
- ( ( ( ( ((U8*)s)[1] & 0xC8 ) == 0x80 ) || ( ( ((U8*)s)[1] & 0xCC ) == 0x88 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x8C ) || ( ( ((U8*)s)[1] & 0xCF ) == 0x8E ) ) ?\
- ( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( ( ((U8*)s)[1] & 0xCF ) == 0x8F ) ? \
- ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? \
- ( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
- : LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
+: ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xC8 ) == 0x80 ) || ( ( ((const U8*)s)[1] & 0xCC ) == 0x88 ) || ( ( ((const U8*)s)[1] & 0xCE ) == 0x8C ) || ( ( ((const U8*)s)[1] & 0xCF ) == 0x8E ) ) ?\
+ ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+ : ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) ? \
+ ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \
+ ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
+ : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
: 0 ) \
-: ( 0xF4 == ((U8*)s)[0] ) ? \
- ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x8E ) ? \
- ( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( 0x8F == ((U8*)s)[1] ) ? \
- ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? \
- ( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
- : LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
+: ( 0xF4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x80 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0x8E ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+ : ( 0x8F == ((const U8*)s)[1] ) ? \
+ ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \
+ ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
+ : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
: 0 ) \
: 0 )
*/
/*** GENERATED CODE ***/
#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
- ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
-: ( 0xE0 == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || ( ((U8*)s)[0] & 0xFE ) == 0xEE ) ?\
- ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xED == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xF0 == ((U8*)s)[0] ) ? \
- ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ? \
- ( LIKELY( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: LIKELY( ( ( ( 0xF4 == ((U8*)s)[0] ) && ( ( ((U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
+( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \
+ ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+: ( 0xE0 == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEC ) || ( ((const U8*)s)[0] & 0xFE ) == 0xEE ) ?\
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xED == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((const U8*)s)[0] ) ? \
+ ( LIKELY( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBF ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \
+ ( LIKELY( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: LIKELY( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( ( ((const U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
#endif /* EBCDIC vs ASCII */
* beginning of a utf8 character. Now that foo_utf8() determines that itself,
* no need to do it again here
*/
-#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !UTF) \
- ? isIDFIRST(*(p)) \
- : isIDFIRST_utf8((const U8*)p))
-#define isWORDCHAR_lazy_if(p,UTF) ((IN_BYTES || (!UTF)) \
- ? isWORDCHAR(*(p)) \
- : isWORDCHAR_utf8((const U8*)p))
-#define isALNUM_lazy_if(p,UTF) isWORDCHAR_lazy_if(p,UTF)
+#define isIDFIRST_lazy_if(p,UTF) \
+ _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isIDFIRST_lazy_if", \
+ "isIDFIRST_lazy_if_safe", \
+ cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
+
+#define isIDFIRST_lazy_if_safe(p, e, UTF) \
+ ((IN_BYTES || !UTF) \
+ ? isIDFIRST(*(p)) \
+ : isIDFIRST_utf8_safe(p, e))
+
+#define isWORDCHAR_lazy_if(p,UTF) \
+ _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isWORDCHAR_lazy_if", \
+ "isWORDCHAR_lazy_if_safe", \
+ cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
+
+#define isWORDCHAR_lazy_if_safe(p, e, UTF) \
+ ((IN_BYTES || !UTF) \
+ ? isWORDCHAR(*(p)) \
+ : isWORDCHAR_utf8_safe((U8 *) p, (U8 *) e))
+
+#define isALNUM_lazy_if(p,UTF) \
+ _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isALNUM_lazy_if", \
+ "isWORDCHAR_lazy_if_safe", \
+ cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
#define UTF8_MAXLEN UTF8_MAXBYTES
/* A Unicode character can fold to up to 3 characters */
#define UTF8_MAX_FOLD_CHAR_EXPAND 3
-#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
+#define IN_BYTES UNLIKELY(CopHINTS_get(PL_curcop) & HINT_BYTES)
/*
* Is so within 'feature unicode_strings' or 'locale :not_characters', and not
* within 'use bytes'. UTF-8 locales are not tested for here, but perhaps
* could be */
-#define IN_UNI_8_BIT \
- (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT)) \
- || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \
- /* -1 below is for :not_characters */ \
- && _is_in_locale_category(FALSE, -1))) \
- && ! IN_BYTES)
+#define IN_UNI_8_BIT \
+ (( ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT)) \
+ || ( CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \
+ /* -1 below is for :not_characters */ \
+ && _is_in_locale_category(FALSE, -1))) \
+ && (! IN_BYTES))
#define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */
#define UTF8_ALLOW_SHORT 0x0008
#define UTF8_GOT_SHORT UTF8_ALLOW_SHORT
-/* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
+/* Overlong sequence; i.e., the code point can be specified in fewer bytes.
+ * First one will convert the overlong to the REPLACEMENT CHARACTER; second
+ * will return what the overlong evaluates to */
#define UTF8_ALLOW_LONG 0x0010
+#define UTF8_ALLOW_LONG_AND_ITS_VALUE (UTF8_ALLOW_LONG|0x0020)
#define UTF8_GOT_LONG UTF8_ALLOW_LONG
-/* Currently no way to allow overflow */
-#define UTF8_GOT_OVERFLOW 0x0020
+#define UTF8_ALLOW_OVERFLOW 0x0080
+#define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW
-#define UTF8_DISALLOW_SURROGATE 0x0040 /* Unicode surrogates */
+#define UTF8_DISALLOW_SURROGATE 0x0100 /* Unicode surrogates */
#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
-#define UTF8_WARN_SURROGATE 0x0080
+#define UTF8_WARN_SURROGATE 0x0200
-#define UTF8_DISALLOW_NONCHAR 0x0100 /* Unicode non-character */
+#define UTF8_DISALLOW_NONCHAR 0x0400 /* Unicode non-character */
#define UTF8_GOT_NONCHAR UTF8_DISALLOW_NONCHAR
-#define UTF8_WARN_NONCHAR 0x0200 /* code points */
+#define UTF8_WARN_NONCHAR 0x0800 /* code points */
-#define UTF8_DISALLOW_SUPER 0x0400 /* Super-set of Unicode: code */
+#define UTF8_DISALLOW_SUPER 0x1000 /* Super-set of Unicode: code */
#define UTF8_GOT_SUPER UTF8_DISALLOW_SUPER
-#define UTF8_WARN_SUPER 0x0800 /* points above the legal max */
+#define UTF8_WARN_SUPER 0x2000 /* points above the legal max */
/* Code points which never were part of the original UTF-8 standard, which only
* went up to 2 ** 31 - 1. Note that these all overflow a signed 32-bit word,
* The first byte of these code points is FE or FF on ASCII platforms. If the
* first byte is FF, it will overflow a 32-bit word. */
-#define UTF8_DISALLOW_ABOVE_31_BIT 0x1000
+#define UTF8_DISALLOW_ABOVE_31_BIT 0x4000
#define UTF8_GOT_ABOVE_31_BIT UTF8_DISALLOW_ABOVE_31_BIT
-#define UTF8_WARN_ABOVE_31_BIT 0x2000
+#define UTF8_WARN_ABOVE_31_BIT 0x8000
/* For back compat, these old names are misleading for UTF_EBCDIC */
#define UTF8_DISALLOW_FE_FF UTF8_DISALLOW_ABOVE_31_BIT
#define UTF8_WARN_FE_FF UTF8_WARN_ABOVE_31_BIT
-#define UTF8_CHECK_ONLY 0x4000
+#define UTF8_CHECK_ONLY 0x10000
+#define _UTF8_NO_CONFIDENCE_IN_CURLEN 0x20000 /* Internal core use only */
/* For backwards source compatibility. They do nothing, as the default now
* includes what they used to mean. The first one's meaning was to allow the
#define UTF8_WARN_ILLEGAL_INTERCHANGE \
(UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
-#define UTF8_ALLOW_ANY \
- (~( UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_DISALLOW_ABOVE_31_BIT \
- |UTF8_WARN_ILLEGAL_INTERCHANGE|UTF8_WARN_ABOVE_31_BIT))
-#define UTF8_ALLOW_ANYUV \
- (UTF8_ALLOW_EMPTY \
- & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE))
-#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
- UTF8_ALLOW_ANYUV)
+/* This is typically used for code that processes UTF-8 input and doesn't want
+ * to have to deal with any malformations that might be present. All such will
+ * be safely replaced by the REPLACEMENT CHARACTER, unless other flags
+ * overriding this are also present. */
+#define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
+ |UTF8_ALLOW_NON_CONTINUATION \
+ |UTF8_ALLOW_SHORT \
+ |UTF8_ALLOW_LONG \
+ |UTF8_ALLOW_OVERFLOW)
+
+/* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but
+ * not any malformed. This is the default. (Note that UVs above IV_MAX are
+ * deprecated. */
+#define UTF8_ALLOW_ANYUV 0
+#define UTF8_ALLOW_DEFAULT UTF8_ALLOW_ANYUV
/*
=for apidoc Am|bool|UTF8_IS_SURROGATE|const U8 *s|const U8 *e
(( LIKELY((e) > (s) + 4) \
&& NATIVE_UTF8_TO_I8(*(s)) >= 0xF9 \
&& ( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \
- || (NATIVE_UTF8_TO_I8(*(s) + 1) >= 0xA2)) \
+ || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \
&& LIKELY((s) + UTF8SKIP(s) <= (e))) \
? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
#else