Move finding perl versions from mktodo to devtools.pl

[perl5.git] / utf8.h
diff --git a/utf8.h b/utf8.h

index 57be2e4..e6c7864 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -34,11 +34,6 @@
  #define FOLD_FLAGS_FULL         0x2
  #define FOLD_FLAGS_NOMIX_ASCII  0x4
  
-/* For _core_swash_init(), internal core use only */
-#define _CORE_SWASH_INIT_USER_DEFINED_PROPERTY 0x1
-#define _CORE_SWASH_INIT_RETURN_IF_UNDEF       0x2
-#define _CORE_SWASH_INIT_ACCEPT_INVLIST        0x4
-
  /*
  =head1 Unicode Support
  L<perlguts/Unicode Support> has an introduction to this API.
@@ -74,9 +69,7 @@ the string is invariant.
  #define uvchr_to_utf8_flags_msgs(d,uv,flags,msgs)                              \
                  uvoffuni_to_utf8_flags_msgs(d,NATIVE_TO_UNI(uv),flags, msgs)
  #define utf8_to_uvchr_buf(s, e, lenp)                                          \
-                                (__ASSERT_((U8*) (e) > (U8*) (s))              \
-                                 utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp,  \
-                                    ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY))
+            utf8_to_uvchr_buf_helper((const U8 *) (s), (const U8 *) e, lenp)
  #define utf8n_to_uvchr(s, len, lenp, flags)                                    \
                                  utf8n_to_uvchr_error(s, len, lenp, flags, 0)
  #define utf8n_to_uvchr_error(s, len, lenp, flags, errors)                      \
@@ -115,11 +108,19 @@ the string is invariant.
  #else  /* ! EBCDIC */
  START_EXTERN_C
  
-/* How wide can a single UTF-8 encoded character become in bytes. */
-/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
- * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
- * expressed with 4 bytes.  However, Perl thinks of UTF-8 as a way to encode
- * non-negative integers in a binary format, even those above Unicode */
+/*
+
+=for apidoc AmnU|STRLEN|UTF8_MAXBYTES
+
+The maximum width of a single UTF-8 encoded character, in bytes.
+
+NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
+is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
+expressed with 4 bytes.  However, Perl thinks of UTF-8 as a way to encode
+non-negative integers in a binary format, even those above Unicode.
+
+=cut
+ */
  #define UTF8_MAXBYTES 13
  
  #ifdef DOINIT
@@ -157,8 +158,51 @@ END_EXTERN_C
  #define PERL_SMALL_MACRO_BUFFER
  #endif
  
-/* Native character to/from iso-8859-1.  Are the identity functions on ASCII
- * platforms */
+/*
+
+=for apidoc Am|U8|NATIVE_TO_LATIN1|U8 ch
+
+Returns the Latin-1 (including ASCII and control characters) equivalent of the
+input native code point given by C<ch>.  Thus, C<NATIVE_TO_LATIN1(193)> on
+EBCDIC platforms returns 65.  These each represent the character C<"A"> on
+their respective platforms.  On ASCII platforms no conversion is needed, so
+this macro expands to just its input, adding no time or space requirements to
+the implementation.
+
+For conversion of code points potentially larger than will fit in a character,
+use L</NATIVE_TO_UNI>.
+
+=for apidoc Am|U8|LATIN1_TO_NATIVE|U8 ch
+
+Returns the native equivalent of the input Latin-1 code point (including ASCII
+and control characters) given by C<ch>.  Thus, C<LATIN1_TO_NATIVE(66)> on
+EBCDIC platforms returns 194.  These each represent the character C<"B"> on
+their respective platforms.  On ASCII platforms no conversion is needed, so
+this macro expands to just its input, adding no time or space requirements to
+the implementation.
+
+For conversion of code points potentially larger than will fit in a character,
+use L</UNI_TO_NATIVE>.
+
+=for apidoc Am|UV|NATIVE_TO_UNI|UV ch
+
+Returns the Unicode equivalent of the input native code point given by C<ch>.
+Thus, C<NATIVE_TO_UNI(195)> on EBCDIC platforms returns 67.  These each
+represent the character C<"C"> on their respective platforms.  On ASCII
+platforms no conversion is needed, so this macro expands to just its input,
+adding no time or space requirements to the implementation.
+
+=for apidoc Am|UV|UNI_TO_NATIVE|UV ch
+
+Returns the native equivalent of the input Unicode code point given by C<ch>.
+Thus, C<UNI_TO_NATIVE(68)> on EBCDIC platforms returns 196.  These each
+represent the character C<"D"> on their respective platforms.  On ASCII
+platforms no conversion is needed, so this macro expands to just its input,
+adding no time or space requirements to the implementation.
+
+=cut
+*/
+
  #ifdef PERL_SMALL_MACRO_BUFFER
  #define NATIVE_TO_LATIN1(ch)     ((U8)(ch))
  #define LATIN1_TO_NATIVE(ch)     ((U8)(ch))
@@ -181,7 +225,6 @@ END_EXTERN_C
  #define I8_TO_NATIVE_UTF8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) (ch)))
  #endif
  
-/* Transforms in wide UV chars */
  #define UNI_TO_NATIVE(ch)        ((UV) (ch))
  #define NATIVE_TO_UNI(ch)        ((UV) (ch))
  
@@ -240,6 +283,7 @@ platforms.  FF signals to use 13 bytes for the encoded character.  This breaks
  the paradigm that the number of leading bits gives how many total bytes there
  are in the character.
  
+=cut
  */
  
  /* Is the representation of the Unicode code point 'cp' the same regardless of
@@ -503,6 +547,20 @@ only) byte is pointed to by C<s>.
  #define UTF8SKIP(s)  PL_utf8skip[*(const U8*)(s)]
  #define UTF8_SKIP(s) UTF8SKIP(s)
  
+/*
+
+=for apidoc Am|STRLEN|UTF8_SAFE_SKIP|char* s|char* e
+Returns 0 if S<C<s E<gt>= e>>; otherwise returns the number of bytes in the
+UTF-8 encoded character whose first byte is pointed to by C<s>, but never more
+than S<C<e - s>>.  On DEBUGGING builds, it asserts that S<C<s E<lt>= e>>.
+
+=cut
+ */
+#define UTF8_SAFE_SKIP(s, e)  (__ASSERT_((e) >= (s))                \
+                              ((e) - (s)) <= 0                      \
+                               ? 0                                  \
+                               : MIN(((e) - (s)), UTF8_SKIP(s)))
+
  /* Most code that says 'UNI_' really means the native value for code points up
   * through 255 */
  #define UNI_IS_INVARIANT(cp)   UVCHR_IS_INVARIANT(cp)
@@ -793,14 +851,14 @@ fit in an IV on the current machine.
                      && (    NATIVE_UTF8_TO_I8(*(s)) >  0xF9                 \
                          || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2))         \
                      &&  LIKELY((s) + UTF8SKIP(s) <= (e)))                   \
-                    ? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
+                    ?  is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
  #else
  #   define UTF8_IS_SUPER(s, e)                                              \
                     ((    LIKELY((e) > (s) + 3)                              \
                       &&  (*(U8*) (s)) >= 0xF4                               \
                       && ((*(U8*) (s)) >  0xF4 || (*((U8*) (s) + 1) >= 0x90))\
                       &&  LIKELY((s) + UTF8SKIP(s) <= (e)))                  \
-                    ? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
+                    ?  is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
  #endif
  
  /* These are now machine generated, and the 'given' clause is no longer
@@ -817,6 +875,10 @@ of the Unicode non-character code points; otherwise it evaluates to 0.  If
  non-zero, the value gives how many bytes starting at C<s> comprise the code
  point's representation.
  
+=for apidoc AmnU|UV|UNICODE_REPLACEMENT
+
+Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER.
+
  =cut
   */
  #define UTF8_IS_NONCHAR(s, e)                                                  \
@@ -967,7 +1029,7 @@ L</is_utf8_string_loclen_flags> to check entire strings.
        ? 1                                                                   \
        : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
          ? 0                                                                 \
-        : _is_utf8_char_helper(s, e, flags))
+        : is_utf8_char_helper(s, e, flags))
  
  /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
   * retained solely for backwards compatibility */