Add, fix comments

author Karl Williamson <public@khwilliamson.com>

Thu, 14 Feb 2013 17:54:32 +0000 (10:54 -0700)

committer Karl Williamson <public@khwilliamson.com>

Mon, 25 Feb 2013 21:57:50 +0000 (14:57 -0700)
author Karl Williamson <public@khwilliamson.com>
Thu, 14 Feb 2013 17:54:32 +0000 (10:54 -0700)
committer Karl Williamson <public@khwilliamson.com>
Mon, 25 Feb 2013 21:57:50 +0000 (14:57 -0700)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index c364c83..808760d 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -2042,10 +2042,10 @@ package Input_file;
  # basically be a while(next_line()) {...} loop.
  #
  # You can also set up handlers to
-#   1) call before the first line is read for pre processing
+#   1) call before the first line is read, for pre processing
  #   2) call to adjust each line of the input before the main handler gets them
  #   3) call upon EOF before the main handler exits its loop
-#   4) call at the end for post processing
+#   4) call at the end, for post processing
  #
  # $_ is used to store the input line, and is to be filtered by the
  # each_line_handler()s.  So, if the format of the line is not in the desired
diff --git a/regcomp.c b/regcomp.c

index d8260e2..141ce91 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -15299,8 +15299,11 @@ S_put_byte(pTHX_ SV *sv, int c)
  
         EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
         ones (binary 1111 1111, hexadecimal FF). It is similar, but not
-       identical, to the ASCII delete (DEL) or rubout control character.
-       ) So the old condition can be simplified to !isPRINT(c)  */
+       identical, to the ASCII delete (DEL) or rubout control character. ...
+       it is typically mapped to hexadecimal code 9F, in order to provide a
+       unique character mapping in both directions)
+
+       So the old condition can be simplified to !isPRINT(c)  */
      if (!isPRINT(c)) {
         if (c < 256) {
             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
diff --git a/toke.c b/toke.c

index 49ff5fa..aace60b 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2863,7 +2863,8 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
  
    In patterns:
      expand:
-      \N{ABC}  => \N{U+41.42.43}
+      \N{FOO}  => \N{U+hex_for_character_FOO}
+      (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
  
      pass through:
         all other \-char, including \N and \N{ apart from \N{FOO}
diff --git a/utf8.c b/utf8.c

index 1bf3f52..ba1304e 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -90,7 +90,7 @@ Perl_is_ascii_string(const U8 *s, STRLEN len)
  /*
  =for apidoc uvuni_to_utf8_flags
  
-Adds the UTF-8 representation of the code point C<uv> to the end
+Adds the UTF-8 representation of the Unicode code point C<uv> to the end
  of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
@@ -109,6 +109,10 @@ This is the recommended Unicode-aware way of saying
  
      *(d++) = uv;
  
+where uv is a code point expressed in Latin-1 or above, not the platform's
+native character set.  B<Almost all code should instead use L</uvchr_to_utf8>
+or L</uvchr_to_utf8_flags>>.
+
  This function will convert to UTF-8 (and not warn) even code points that aren't
  legal Unicode or are problematic, unless C<flags> contains one or more of the
  following flags:
@@ -119,8 +123,9 @@ UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
  If both flags are set, the function will both warn and return NULL.
  
  The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
-affect how the function handles a Unicode non-character.  And, likewise for the
-UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
+affect how the function handles a Unicode non-character.  And likewise, the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
+code points that are
  above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
  even less portable) can be warned and/or disallowed even if other above-Unicode
  code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
@@ -258,7 +263,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
         return d;
      }
  #endif
-#endif /* Loop style */
+#endif /* Non loop style */
  }
  
  /*
@@ -275,7 +280,7 @@ or less you should use the IS_UTF8_CHAR(), for lengths of five or more
  you should use the _slow().  In practice this means that the _slow()
  will be used very rarely, since the maximum Unicode code point (as of
  Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
-the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
+the "Perl extended UTF-8" (e.g., the infamous 'v-strings') will encode into
  five bytes or more.
  
  =cut */
diff --git a/utf8.h b/utf8.h

index f990f37..01d8f5f 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -136,7 +136,7 @@ END_EXTERN_C
     U+0800..U+0FFF      E0      * A0..BF    80..BF
     U+1000..U+CFFF       E1..EC    80..BF    80..BF
     U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       +++++++ utf16 surrogates, not legal utf8 +++++++
+   U+D800..U+DFFF       ED        A0..BF    80..BF  (surrogates)
     U+E000..U+FFFF       EE..EF    80..BF    80..BF
    U+10000..U+3FFFF     F0      * 90..BF    80..BF    80..BF
    U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
@@ -144,7 +144,7 @@ END_EXTERN_C
      Below are non-Unicode code points
   U+110000..U+13FFFF    F4        90..BF    80..BF    80..BF
   U+140000..U+1FFFFF    F5..F7    80..BF    80..BF    80..BF
- U+200000:              F8..    * 88..BF    80..BF    80..BF    80..BF
+ U+200000..:            F8..    * 88..BF    80..BF    80..BF    80..BF
  
  Note the gaps before several of the byte entries above marked by '*'.  These are
  caused by legal UTF-8 avoiding non-shortest encodings: it is technically
@@ -275,6 +275,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  
  #define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)     /* a clearer synonym */
  
+/* Adds a UTF8 continuation byte 'new' of information to a running total code
+ * point 'old' of all the continuation bytes so far.  This is designed to be
+ * used in a loop to convert from UTF-8 to the code point represented */
  #define UTF8_ACCUMULATE(old, new)      (((old) << UTF_ACCUMULATION_SHIFT)     \
                                          | (((U8)new) & UTF_CONTINUATION_MASK))
  
diff --git a/utfebcdic.h b/utfebcdic.h

index 5705b96..c6001b2 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -152,7 +152,7 @@ unsigned char PL_utf8skip[] = {
   * remains 'A' */
  
  #if '^' == 95   /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */
-EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */
+EXTCONST unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (IBM-1047) */
   0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
   0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
@@ -171,7 +171,7 @@ EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */
   0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE
  };
  
-EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */
+EXTCONST unsigned char PL_e2utf[] = { /* UTFEBCDIC (IBM-1047) to I8 */
   0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
   0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
@@ -192,7 +192,7 @@ EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */
  #endif /* 1047 */
  
  #if '^' == 106  /* if defined(_OSD_POSIX) POSIX-BC */
-unsigned char PL_utf2e[] = { /* I8 to EBCDIC (POSIX-BC) */
+unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (POSIX-BC) */
   0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
   0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
@@ -211,7 +211,7 @@ unsigned char PL_utf2e[] = { /* I8 to EBCDIC (POSIX-BC) */
   0xDC, 0xC0, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xDD, 0xFC, 0xE0, 0xFE
  };
  
-unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */
+unsigned char PL_e2utf[] = { /* UTFEBCDIC (POSIX-BC) to I8 */
   0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
   0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
@@ -232,7 +232,7 @@ unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */
  #endif /* POSIX-BC */
  
  #if '^' == 176  /* if defined(??) (OS/400?) 037 */
-unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */
+unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (IBM-037) */
   0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
   0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
@@ -251,7 +251,7 @@ unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */
   0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE
  };
  
-unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to I8 */
+unsigned char PL_e2utf[] = { /* UTFEBCDIC (IBM-037) to I8 */
   0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
   0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
author	Karl Williamson <public@khwilliamson.com>
	Thu, 14 Feb 2013 17:54:32 +0000 (10:54 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Mon, 25 Feb 2013 21:57:50 +0000 (14:57 -0700)
lib/unicore/mktables		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history
utfebcdic.h		patch \| blob \| blame \| history