utfebcdic.h

   1 /*    utfebcdic.h
   2  *
   3  *    Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009,
   4  *    2010, 2011 by Larry Wall, Nick Ing-Simmons, and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  * Macros to implement UTF-EBCDIC as perl's internal encoding
  10  * Adapted from version 7.1 of Unicode Technical Report #16:
  11  *  http://www.unicode.org/unicode/reports/tr16
  12  *
  13  * To summarize, the way it works is:
  14  * To convert an EBCDIC character to UTF-EBCDIC:
  15  *  1)  convert to Unicode.  The table in the generated file 'ebcdic_tables.h'
  16  *      that does this for EBCDIC bytes is PL_e2a (with inverse PL_a2e).  The
  17  *      'a' stands for ASCII platform, meaning latin1.
  18  *  2)  convert that to a utf8-like string called I8 ('I' stands for
  19  *      intermediate) with variant characters occupying multiple bytes.  This
  20  *      step is similar to the utf8-creating step from Unicode, but the details
  21  *      are different.  This transformation is called UTF8-Mod.  There is a
  22  *      chart about the bit patterns in a comment later in this file.  But
  23  *      essentially here are the differences:
  24  *                          UTF8                I8
  25  *      invariant byte      starts with 0       starts with 0 or 100
  26  *      continuation byte   starts with 10      starts with 101
  27  *      start byte          same in both: if the code point requires N bytes,
  28  *                          then the leading N bits are 1, followed by a 0.  (No
  29  *                          trailing 0 for the very largest possible allocation
  30  *                          in I8, far beyond the current Unicode standard's
  31  *                          max, as shown in the comment later in this file.)
  32  *  3)  Use the algorithm in tr16 to convert each byte from step 2 into
  33  *      final UTF-EBCDIC.  This is done by table lookup from a table
  34  *      constructed from the algorithm, reproduced in ebcdic_tables.h as
  35  *      PL_utf2e, with its inverse being PL_e2utf.  They are constructed so that
  36  *      all EBCDIC invariants remain invariant, but no others do, and the first
  37  *      byte of a variant will always have its upper bit set.  But note that
  38  *      the upper bit of some invariants is also 1.
  39  *
  40  *  For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in
  41  *  UTF-EBCDIC.  Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3
  42  *  converts it back to 193.  As an example of how a variant character works,
  43  *  take LATIN SMALL LETTER Y WITH DIAERESIS, which is typically 0xDF in
  44  *  EBCDIC.  Step 1 converts it to the Unicode value, 0xFF.  Step 2 converts
  45  *  that to two bytes = 11000111 10111111 = C7 BF, and Step 3 converts those to
  46  *  0x8B 0x73.
  47  *
  48  * If you're starting from Unicode, skip step 1.  For UTF-EBCDIC to straight
  49  * EBCDIC, reverse the steps.
  50  *
  51  * The EBCDIC invariants have been chosen to be those characters whose Unicode
  52  * equivalents have ordinal numbers less than 160, that is the same characters
  53  * that are expressible in ASCII, plus the C1 controls.  So there are 160
  54  * invariants instead of the 128 in UTF-8.
  55  *
  56  * The purpose of Step 3 is to make the encoding be invariant for the chosen
  57  * characters.  This messes up the convenient patterns found in step 2, so
  58  * generally, one has to undo step 3 into a temporary to use them.  However,
  59  * one "shadow", or parallel table, PL_utf8skip, has been constructed that
  60  * doesn't require undoing things.  It is such that for each byte, it says
  61  * how long the sequence is if that (UTF-EBCDIC) byte were to begin it.
  62  *
  63  * There are actually 3 slightly different UTF-EBCDIC encodings in
  64  * ebcdic_tables.h, one for each of the code pages recognized by Perl.  That
  65  * means that there are actually three different sets of tables, one for each
  66  * code page.  (If Perl is compiled on platforms using another EBCDIC code
  67  * page, it may not compile, or Perl may silently mistake it for one of the
  68  * three.)
  69  *
  70  * Note that tr16 actually only specifies one version of UTF-EBCDIC, based on
  71  * the 1047 encoding, and which is supposed to be used for all code pages.
  72  * But this doesn't work.  To illustrate the problem, consider the '^' character.
  73  * On a 037 code page it is the single byte 176, whereas under 1047 UTF-EBCDIC
  74  * it is the single byte 95.  If Perl implemented tr16 exactly, it would mean
  75  * that changing a string containing '^' to UTF-EBCDIC would change that '^'
  76  * from 176 to 95 (and vice-versa), violating the rule that ASCII-range
  77  * characters are represented the same whether or not a string is UTF-8
  78  * encoded.  Much code in Perl assumes this rule.  See for example
  79  * http://grokbase.com/t/perl/mvs/025xf0yhmn/utf-ebcdic-for-posix-bc-malformed-utf-8-character
  80  * What Perl does is create a version of UTF-EBCDIC suited to each code page;
  81  * the one for the 1047 code page is identical to what's specified in tr16.
  82  * This complicates interchanging files between computers using different code
  83  * pages.  Best is to convert to I8 before sending them, as the I8
  84  * representation is the same no matter what the underlying code page is.
  85  *
  86  * Because of the way UTF-EBCDIC is constructed, the lowest 32 code points that
  87  * aren't equivalent to ASCII characters nor C1 controls form the set of
  88  * continuation bytes; the remaining 64 non-ASCII, non-control code points form
  89  * the potential start bytes, in order.  (However, the first 5 of these lead to
  90  * malformed overlongs, so there really are only 59 start bytes.) Hence the
  91  * UTF-EBCDIC for the smallest variant code point, 0xA0, will likely have 0x41
  92  * as its continuation byte, provided 0x41 isn't an ASCII or C1 equivalent.
  93  * And its start byte will be the code point that is 37 (32+5) non-ASCII,
  94  * non-control code points past it.  (0 - 3F are controls, and 40 is SPACE,
  95  * leaving 41 as the first potentially available one.)  In contrast, on ASCII
  96  * platforms, the first 64 (not 32) non-ASCII code points are the continuation
  97  * bytes.  And the first 2 (not 5) potential start bytes form overlong
  98  * malformed sequences.
  99  *
 100  * EBCDIC characters above 0xFF are the same as Unicode in Perl's
 101  * implementation of all 3 encodings, so for those Step 1 is trivial.
 102  *
 103  * (Note that the entries for invariant characters are necessarily the same in
 104  * PL_e2a and PL_e2utf; likewise for their inverses.)
 105  *
 106  * UTF-EBCDIC strings are the same length or longer than UTF-8 representations
 107  * of the same string.  The maximum code point representable as 2 bytes in
 108  * UTF-EBCDIC is 0x3FF, instead of 0x7FF in UTF-8.
 109  */
 110
 111 START_EXTERN_C
 112
     /* Byte-indexed translation tables for the EBCDIC code page in use.
      * When DOINIT is defined the tables themselves are pulled in from the
      * generated file ebcdic_tables.h; otherwise only the declarations below
      * are visible. */
 113 #ifdef DOINIT
 114
 115 #include "ebcdic_tables.h"
 116
 117 #else
 118 EXTCONST U8 PL_utf8skip[];       /* UTF-EBCDIC byte -> length of the sequence that byte would begin */
 119 EXTCONST U8 PL_e2utf[];          /* UTF-EBCDIC byte -> I8 byte (inverse of PL_utf2e) */
 120 EXTCONST U8 PL_utf2e[];          /* I8 byte -> UTF-EBCDIC byte */
 121 EXTCONST U8 PL_e2a[];            /* EBCDIC byte -> Unicode (Latin-1) code point */
 122 EXTCONST U8 PL_a2e[];            /* Latin-1 code point -> EBCDIC byte (inverse of PL_e2a) */
     /* NOTE(review): the four tables below are not described anywhere in this
      * file; judging by their names they are presumably case-fold and
      * case-mapping tables -- confirm against ebcdic_tables.h. */
 123 EXTCONST U8 PL_fold[];
 124 EXTCONST U8 PL_fold_latin1[];
 125 EXTCONST U8 PL_latin1_lc[];
 126 EXTCONST U8 PL_mod_latin1_uc[];
 127 #endif
 128
 129 END_EXTERN_C
 130
 131 /* EBCDIC-happy ways of converting native code to UTF-8.
      *
      * All four macros are single table lookups on one byte; the argument is
      * cast to U8 before it is used as the index. */
 132
     /* Step 1 of the header comment and its inverse:
      * native EBCDIC byte <-> Latin-1 (Unicode) code point. */
 133 #define NATIVE_TO_LATIN1(ch)            PL_e2a[(U8)(ch)]
 134 #define LATIN1_TO_NATIVE(ch)            PL_a2e[(U8)(ch)]
 135
     /* Step 3 of the header comment, undone and applied:
      * one UTF-EBCDIC byte <-> the corresponding I8 byte. */
 136 #define NATIVE_UTF8_TO_I8(ch)           PL_e2utf[(U8)(ch)]
 137 #define I8_TO_NATIVE_UTF8(ch)           PL_utf2e[(U8)(ch)]
 138
 139 /* Transforms in wide UV chars.
      *
      * Values above 255 are returned unchanged (native and Unicode code points
      * coincide there); values of 255 or less go through the single-byte
      * tables.  'ch' is evaluated twice, so pass an expression without side
      * effects. */
 140 #define NATIVE_TO_UNI(ch)        (((ch) > 255) ? (ch) : NATIVE_TO_LATIN1(ch))
 141 #define UNI_TO_NATIVE(ch)        (((ch) > 255) ? (ch) : LATIN1_TO_NATIVE(ch))
 142
 143 /*
 144   The following table, adapted from tr16, shows the I8 encoding of Unicode code points.
 145
 146         Unicode                             Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte
 147     U+0000..U+007F                     000000000xxxxxxx 0xxxxxxx
 148     U+0080..U+009F                     00000000100xxxxx 100xxxxx
 149     U+00A0..U+03FF                     000000yyyyyxxxxx 110yyyyy 101xxxxx
 150     U+0400..U+3FFF                     00zzzzyyyyyxxxxx 1110zzzz 101yyyyy 101xxxxx
 151     U+4000..U+3FFFF                 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx
 152    U+40000..U+3FFFFF            0vvwwwwwzzzzzyyyyyxxxxx 111110vv 101wwwww 101zzzzz 101yyyyy 101xxxxx
 153   U+400000..U+3FFFFFF       0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx
 154  U+4000000..U+7FFFFFFF 0tuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 1111111t 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx
 155
 156   Note: The I8 transformation is valid for UCS-4 values X'0' to
 157   X'7FFFFFFF' (the full extent of ISO/IEC 10646 coding space).
 158
 159  */
 160
 161 /* Input is a true Unicode (not-native) code point.
      *
      * OFFUNISKIP(uv) gives the number of bytes that the I8 (and hence
      * UTF-EBCDIC, which is a byte-for-byte mapping of I8) encoding of 'uv'
      * occupies.  The thresholds are the row boundaries of the I8 table
      * earlier in this file; anything at or above 0x4000000 yields 7.
      * 'uv' appears once per comparison, so it may be evaluated up to six
      * times and should have no side effects. */
 162 #define OFFUNISKIP(uv) ( (uv) < 0xA0        ? 1 : \
 163                       (uv) < 0x400          ? 2 : \
 164                       (uv) < 0x4000         ? 3 : \
 165                       (uv) < 0x40000        ? 4 : \
 166                       (uv) < 0x400000       ? 5 : \
 167                       (uv) < 0x4000000      ? 6 : 7 )
 168
     /* True when the Unicode code point 'c' (cast to UV) is below 0xA0, i.e. is
      * one of the 160 invariants that encode as a single byte. */
 169 #define UNI_IS_INVARIANT(c)             (((UV)(c)) <  0xA0)
 170
 171 /* UTF-EBCDIC semantic macros - transform back into I8 and then compare
 172  * Comments as to the meaning of each are given at their corresponding utf8.h
 173  * definitions */
 174
     /* Each macro takes one UTF-EBCDIC byte 'c' and tests its I8 equivalent.
      * The two-comparison macros look 'c' up (and so evaluate it) twice. */

     /* Start byte of a multi-byte sequence.  I8 bytes 0xC0..0xC4 are excluded
      * because they could only begin overlongs (the first two-byte code point,
      * U+00A0, needs start byte 0xC5).  0xE0 is excluded as well: per the I8
      * table a 3-byte sequence begins at U+0400, which needs a non-zero zzzz
      * field, so 0xE0 would presumably only begin an overlong --
      * NOTE(review): confirm against the utf8.h commentary. */
 175 #define UTF8_IS_START(c)                (NATIVE_UTF8_TO_I8(c) >= 0xC5     \
 176                                          && NATIVE_UTF8_TO_I8(c) != 0xE0)
     /* I8 byte has the continuation pattern 101xxxxx. */
 177 #define UTF8_IS_CONTINUATION(c)         ((NATIVE_UTF8_TO_I8(c) & 0xE0) == 0xA0)
     /* I8 byte is 0xA0 or above, i.e. it is not a single-byte invariant. */
 178 #define UTF8_IS_CONTINUED(c)            (NATIVE_UTF8_TO_I8(c) >= 0xA0)
 179
     /* I8 start bytes 0xC5..0xC7: the two-byte sequences that encode
      * U+00A0..U+00FF, the only variants that fit in a single native byte. */
 180 #define UTF8_IS_DOWNGRADEABLE_START(c)  (NATIVE_UTF8_TO_I8(c) >= 0xC5     \
 181                                          && NATIVE_UTF8_TO_I8(c) <= 0xC7)
     /* Alternative formulation, kept for reference only (it is commented out
      * and not compiled). */
 182 /* Saying it this way adds a runtime test, but removes 2 run-time lookups */
 183 /*#define UTF8_IS_DOWNGRADEABLE_START(c)  ((c) == I8_TO_NATIVE_UTF8(0xC5)     \
 184                                          || (c) == I8_TO_NATIVE_UTF8(0xC6)  \
 185                                          || (c) == I8_TO_NATIVE_UTF8(0xC7))
 186 */
     /* I8 byte is 0xC8 or above; 0xC8 is the start byte of U+0100, the first
      * code point beyond Latin-1. */
 187 #define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_UTF8_TO_I8(c) >= 0xC8)
 188
 189 /* Can't exceed 7 on EBCDIC platforms.
      *
      * UTF_START_MARK(len) is the fixed high-bit pattern of the I8 start byte
      * of a len-byte sequence: 2 -> 0xC0, 3 -> 0xE0, 4 -> 0xF0, 5 -> 0xF8,
      * 6 -> 0xFC, 7 -> 0xFE.  A 'len' above 7 would make the shift count
      * negative. */
 190 #define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len))))
 191
     /* Mask selecting the payload bits of the start byte of a len-byte
      * sequence: 2 -> 0x1F, 3 -> 0x0F, 4 -> 0x07, 5 -> 0x03, 6 and 7 -> 0x01.
      * Meaningful only for len >= 2 (len == 1 would shift by -1). */
 192 #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2)))
     /* I8 continuation bytes are 101xxxxx: a 0xA0 marker in the top three bits
      * and five payload bits, so each continuation byte adds 5 bits to the
      * accumulated code point. */
 193 #define UTF_CONTINUATION_MARK           0xA0
 194 #define UTF_CONTINUATION_MASK           ((U8)0x1f)
 195 #define UTF_ACCUMULATION_SHIFT          5
 196
 197 /* How wide can a single UTF-8 encoded character become in bytes. */
 198 /* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
 199  * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
 200  * expressed with 5 bytes.  However, Perl thinks of UTF-8 as a way to encode
 201  * non-negative integers in a binary format, even those above Unicode */
     /* 7 is the length of the longest row of the I8 table earlier in this file
      * (U+4000000..U+7FFFFFFF). */
 202 #define UTF8_MAXBYTES 7
 203
 204 /* The maximum number of UTF-8 bytes a single Unicode character can
 205  * uppercase/lowercase/fold into.  Unicode guarantees that the maximum
 206  * expansion is 3 characters.  On EBCDIC platforms, the highest Unicode
 207  * character occupies 5 bytes, therefore this number is 15 */
     /* That is 3 characters * 5 bytes; U+10FFFF lies in the 5-byte row
      * (U+40000..U+3FFFFF) of the I8 table earlier in this file. */
 208 #define UTF8_MAXBYTES_CASE      15
 209
 210 /* ^? is defined to be APC on EBCDIC systems.  See the definition of toCTRL()
 211  * for more */
     /* 0x9F is APC's Latin-1/Unicode code point; LATIN1_TO_NATIVE maps it to
      * the byte the current EBCDIC code page uses for it. */
 212 #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)
 213
 214 #define MAX_UTF8_TWO_BYTE 0x3FF /* highest code point that fits in two
                                     * UTF-EBCDIC bytes (the U+00A0..U+03FF
                                     * row of the I8 table) */
 215
 216 /*
 217  * ex: set ts=8 sts=4 sw=4 et:
 218  */