END_EXTERN_C
-#if defined(_MSC_VER) && _MSC_VER < 1400
-/* older MSVC versions have a smallish macro buffer */
-#define PERL_SMALL_MACRO_BUFFER
-#endif
-
/*
=for apidoc Am|U8|NATIVE_TO_LATIN1|U8 ch
=cut
*/
-#ifdef PERL_SMALL_MACRO_BUFFER
-# define NATIVE_TO_LATIN1(ch) ((U8)(ch))
-# define LATIN1_TO_NATIVE(ch) ((U8)(ch))
-#else
-# define NATIVE_TO_LATIN1(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) ((ch) | 0)))
-# define LATIN1_TO_NATIVE(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) ((ch) | 0)))
-#endif
+#define NATIVE_TO_LATIN1(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) ((ch) | 0)))
+#define LATIN1_TO_NATIVE(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) ((ch) | 0)))
/* I8 is an intermediate version of UTF-8 used only in UTF-EBCDIC. We thus
* consider it to be identical to UTF-8 on ASCII platforms. Strictly speaking
* UTF-8 and UTF-EBCDIC are two different things, but we often conflate them
* because they are 8-bit encodings that serve the same purpose in Perl, and
* rarely do we need to distinguish them. The term "NATIVE_UTF8" applies to
* whichever one is applicable on the current platform */
-#ifdef PERL_SMALL_MACRO_BUFFER
-#define NATIVE_UTF8_TO_I8(ch) ((U8) (ch))
-#define I8_TO_NATIVE_UTF8(ch) ((U8) (ch))
-#else
#define NATIVE_UTF8_TO_I8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) ((ch) | 0)))
#define I8_TO_NATIVE_UTF8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) ((ch) | 0)))
-#endif
#define UNI_TO_NATIVE(ch) ((UV) ((ch) | 0))
#define NATIVE_TO_UNI(ch) ((UV) ((ch) | 0))
((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
/* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
- * This doesn't catch invariants (they are single-byte). It also excludes the
+ * This excludes invariants (they are single-byte). It also excludes the
* illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and
- * C0-C4 I8 start bytes on EBCDIC ones */
-#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ * C0-C4 I8 start bytes on EBCDIC ones. On EBCDIC E0 can't start a
+ * non-overlong sequence, so we define a base macro and for those platforms,
+ * extend it to also exclude E0 */
+#define UTF8_IS_START_base(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))
+#ifdef EBCDIC
+# define UTF8_IS_START(c) \
+ (UTF8_IS_START_base(c) && (c) != I8_TO_NATIVE_UTF8(0xE0))
+#else
+# define UTF8_IS_START(c) UTF8_IS_START_base(c)
+#endif
#define UTF_MIN_ABOVE_LATIN1_BYTE \
((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
=cut
*/
#define UTF8_MAXBYTES_CASE \
- (UTF8_MAXBYTES >= (UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF)) \
- ? UTF8_MAXBYTES \
- : (UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF)))
+ MAX(UTF8_MAXBYTES, UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF))
/* Rest of these are attributes of Unicode and perl's internals rather than the
* encoding, or happen to be the same in both ASCII and EBCDIC (at least at
* UTF-8 encoded character that mark it as a start byte and give the number of
* bytes that comprise the character. 'len' is the number of bytes in the
* multi-byte sequence. */
-#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
+#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : ((U8) (0xFE << (7-(len)))))
/* Masks out the initial one bits in a start byte, leaving the real data ones.
* Doesn't work on an invariant byte. 'len' is the number of bytes in the
* beginning of a utf8 character. Now that foo_utf8() determines that itself,
* no need to do it again here
*/
-#define isIDFIRST_lazy_if(p,UTF) \
- _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isIDFIRST_lazy_if", \
- "isIDFIRST_lazy_if_safe", \
- cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
/* Tests whether the character at 'p' satisfies isIDFIRST, choosing between
 * the single-byte and the UTF-8 form of the test:
 *   - if IN_BYTES is true or 'UTF' is false, only the byte *p is examined,
 *     via isIDFIRST();
 *   - otherwise 'p' and 'e' are handed to isIDFIRST_utf8_safe().
 * 'e' is presumably the end of the buffer that 'p' points into -- confirm
 * against isIDFIRST_utf8_safe().
 *
 * 'UTF' is parenthesized before being negated so that an expression argument
 * such as 'a && b' is negated as a whole rather than having '!' bind to its
 * first operand only. */
#define isIDFIRST_lazy_if_safe(p, e, UTF)                                   \
                   ((IN_BYTES || !(UTF))                                    \
                    ? isIDFIRST(*(p))                                       \
                    : isIDFIRST_utf8_safe(p, e))
-
-#define isWORDCHAR_lazy_if(p,UTF) \
- _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isWORDCHAR_lazy_if", \
- "isWORDCHAR_lazy_if_safe", \
- cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
/* Tests whether the character at 'p' satisfies isWORDCHAR, choosing between
 * the single-byte and the UTF-8 form of the test:
 *   - if IN_BYTES is true or 'UTF' is false, only the byte *p is examined,
 *     via isWORDCHAR();
 *   - otherwise 'p' and 'e' are cast to (U8 *) and handed to
 *     isWORDCHAR_utf8_safe().
 * 'e' is presumably the end of the buffer that 'p' points into -- confirm
 * against isWORDCHAR_utf8_safe().
 *
 * Every parameter is parenthesized at its point of use: 'UTF' so that an
 * expression argument such as 'a && b' is negated as a whole, and 'p' / 'e'
 * so that the (U8 *) cast applies to the entire argument expression. */
#define isWORDCHAR_lazy_if_safe(p, e, UTF)                                  \
                   ((IN_BYTES || !(UTF))                                    \
                    ? isWORDCHAR(*(p))                                      \
                    : isWORDCHAR_utf8_safe((U8 *) (p), (U8 *) (e)))
-
-#define isALNUM_lazy_if(p,UTF) \
- _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isALNUM_lazy_if", \
- "isWORDCHAR_lazy_if_safe", \
- cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
+#define isALNUM_lazy_if_safe(p, e, UTF) isWORDCHAR_lazy_if_safe(p, e, UTF)
#define UTF8_MAXLEN UTF8_MAXBYTES
#define UNICODE_DISALLOW_NONCHAR 0x0020
#define UNICODE_DISALLOW_SUPER 0x0040
#define UNICODE_DISALLOW_PERL_EXTENDED 0x0080
+
+#ifdef PERL_CORE
+# define UNICODE_ALLOW_ABOVE_IV_MAX 0x0100
+#endif
#define UNICODE_DISALLOW_ABOVE_31_BIT UNICODE_DISALLOW_PERL_EXTENDED
#define UNICODE_GOT_SURROGATE UNICODE_DISALLOW_SURROGATE
#define UNI_DISPLAY_ISPRINT 0x0001
#define UNI_DISPLAY_BACKSLASH 0x0002
-#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
+#define UNI_DISPLAY_BACKSPACE 0x0004 /* Allow \b when also
+ UNI_DISPLAY_BACKSLASH */
+#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT \
+ |UNI_DISPLAY_BACKSLASH \
+ |UNI_DISPLAY_BACKSPACE)
+
+/* Character classes could also allow \b, but not patterns in general */
#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
#define ANYOF_FOLD_SHARP_S(node, input, end) \