From 59645eb176f6a56df33554f3bedd0d3a7e071455 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 5 Dec 2019 12:26:21 -0700 Subject: [PATCH] Fix UTF8_IS_START on EBCDIC --- utf8.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/utf8.h b/utf8.h index 8c4cfd5..fa036f0 100644 --- a/utf8.h +++ b/utf8.h @@ -380,11 +380,19 @@ encoded as UTF-8. C is a native (ASCII or EBCDIC) code point if less than ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) /* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence? - * This doesn't catch invariants (they are single-byte). It also excludes the + * This excludes invariants (they are single-byte). It also excludes the * illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and - * C0-C4 I8 start bytes on EBCDIC ones */ -#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + * C0-C4 I8 start bytes on EBCDIC ones. On EBCDIC, I8 E0 can't start a + * non-overlong sequence, so we define a base macro and, on those platforms, + * extend it to also exclude the native byte that corresponds to I8 E0 */ +#define UTF8_IS_START_base(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE)) +#ifdef EBCDIC +# define UTF8_IS_START(c) \ + (UTF8_IS_START_base(c) && (c) != I8_TO_NATIVE_UTF8(0xE0)) +#else +# define UTF8_IS_START(c) UTF8_IS_START_base(c) +#endif #define UTF_MIN_ABOVE_LATIN1_BYTE \ ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) -- 1.8.3.1