X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6f89c5a0e6dc613d9b45f50b12f5ad9b69d7a6df..ee16020279bc895096981c490d3477b7a8deebae:/utf8.h diff --git a/utf8.h b/utf8.h index 7cd163a..a93519a 100644 --- a/utf8.h +++ b/utf8.h @@ -72,6 +72,8 @@ the string is invariant. #define utf8_to_uvchr_buf(s, e, lenp) \ utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp, \ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) +#define utf8n_to_uvchr(s, len, lenp, flags) \ + utf8n_to_uvchr_error(s, len, lenp, flags, 0) #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL) #define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL) @@ -526,13 +528,6 @@ encoded as UTF-8. C is a native (ASCII or EBCDIC) code point if less than | ((NATIVE_UTF8_TO_I8((U8)new)) \ & UTF_CONTINUATION_MASK)) -/* If a value is anded with this, and the result is non-zero, then using the - * original value in UTF8_ACCUMULATE will overflow, shifting bits off the left - * */ -#define UTF_ACCUMULATION_OVERFLOW_MASK \ - (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) \ - - UTF_ACCUMULATION_SHIFT)) - /* This works in the face of malformed UTF-8. */ #define UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e) (UTF8_IS_DOWNGRADEABLE_START(*s) \ && ( (e) - (s) > 1) \ @@ -718,26 +713,37 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */ +#define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY /* Allow first byte to be a continuation byte */ #define UTF8_ALLOW_CONTINUATION 0x0002 +#define UTF8_GOT_CONTINUATION UTF8_ALLOW_CONTINUATION -/* Allow second... 
bytes to be non-continuation bytes */ +/* Unexpected non-continuation byte */ #define UTF8_ALLOW_NON_CONTINUATION 0x0004 +#define UTF8_GOT_NON_CONTINUATION UTF8_ALLOW_NON_CONTINUATION /* expecting more bytes than were available in the string */ #define UTF8_ALLOW_SHORT 0x0008 +#define UTF8_GOT_SHORT UTF8_ALLOW_SHORT /* Overlong sequence; i.e., the code point can be specified in fewer bytes. */ #define UTF8_ALLOW_LONG 0x0010 +#define UTF8_GOT_LONG UTF8_ALLOW_LONG + +/* Currently no way to allow overflow */ +#define UTF8_GOT_OVERFLOW 0x0020 #define UTF8_DISALLOW_SURROGATE 0x0040 /* Unicode surrogates */ +#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE #define UTF8_WARN_SURROGATE 0x0080 #define UTF8_DISALLOW_NONCHAR 0x0100 /* Unicode non-character */ +#define UTF8_GOT_NONCHAR UTF8_DISALLOW_NONCHAR #define UTF8_WARN_NONCHAR 0x0200 /* code points */ #define UTF8_DISALLOW_SUPER 0x0400 /* Super-set of Unicode: code */ +#define UTF8_GOT_SUPER UTF8_DISALLOW_SUPER #define UTF8_WARN_SUPER 0x0800 /* points above the legal max */ /* Code points which never were part of the original UTF-8 standard, which only @@ -745,6 +751,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag. * The first byte of these code points is FE or FF on ASCII platforms. If the * first byte is FF, it will overflow a 32-bit word. */ #define UTF8_DISALLOW_ABOVE_31_BIT 0x1000 +#define UTF8_GOT_ABOVE_31_BIT UTF8_DISALLOW_ABOVE_31_BIT #define UTF8_WARN_ABOVE_31_BIT 0x2000 /* For back compat, these old names are misleading for UTF_EBCDIC */ @@ -772,9 +779,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag. 
#define UTF8_ALLOW_ANY \ (~( UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_DISALLOW_ABOVE_31_BIT \ |UTF8_WARN_ILLEGAL_INTERCHANGE|UTF8_WARN_ABOVE_31_BIT)) -#define UTF8_ALLOW_ANYUV \ - (UTF8_ALLOW_EMPTY \ - & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) +#define UTF8_ALLOW_ANYUV UTF8_ALLOW_EMPTY #define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \ UTF8_ALLOW_ANYUV)