Update Locale-Codes to CPAN version 3.35

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 57b1580..c3f0200 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -32,7 +32,6 @@
  #define PERL_IN_UTF8_C
  #include "perl.h"
  #include "inline_invlist.c"
-#include "charclass_invlists.h"
  
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
@@ -95,7 +94,7 @@ This function is like them, but the input is a strict Unicode
  (as opposed to native) code point.  Only in very rare circumstances should code
  not be using the native code point.
  
-For details, see the description for L</uvchr_to_utf8_flags>>.
+For details, see the description for L</uvchr_to_utf8_flags>.
  
  =cut
  */
@@ -140,7 +139,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
             {
  #ifdef EBCDIC
                  Perl_die(aTHX_ "Can't represent character for Ox%"UVXf" on this platform", uv);
-                NOT_REACHED;
+                NOT_REACHED; /* NOTREACHED */
  #endif
                 return NULL;
             }
@@ -241,7 +240,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  =for apidoc uvchr_to_utf8
  
  Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
  C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
  the byte after the end of the new character.  In other words,
  
@@ -270,7 +269,7 @@ Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
  =for apidoc uvchr_to_utf8_flags
  
  Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
  C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
  the byte after the end of the new character.  In other words,
  
@@ -319,22 +318,6 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  }
  
  /*
-=for apidoc is_utf8_char_buf
-
-This is identical to the macro L</isUTF8_CHAR>.
-
-=cut */
-
-STRLEN
-Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
-{
-
-    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
-
-    return isUTF8_CHAR(buf, buf_end);
-}
-
-/*
  =for apidoc is_utf8_string
  
  Returns true if the first C<len> bytes of string C<s> form a valid
@@ -806,13 +789,13 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
       *     is the label <malformed>.
       */
  
-malformed:
+  malformed:
  
      if (sv && ckWARN_d(WARN_UTF8)) {
         pack_warn = packWARN(WARN_UTF8);
      }
  
-disallowed:
+  disallowed:
  
      if (flags & UTF8_CHECK_ONLY) {
         if (retlen)
@@ -820,7 +803,7 @@ disallowed:
         return 0;
      }
  
-do_warn:
+  do_warn:
  
      if (pack_warn) {   /* <pack_warn> was initialized to 0, and changed only
                            if warnings are to be raised. */
@@ -1294,19 +1277,26 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  #define LAST_HIGH_SURROGATE  0xDBFF
  #define FIRST_LOW_SURROGATE  0xDC00
  #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
-       if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
-           if (p >= pend) {
-               Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
-           } else {
+
+        /* This assumes that most uses will be in the first Unicode plane, not
+         * needing surrogates */
+       if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
+                  && uv <= UNICODE_SURROGATE_LAST))
+        {
+            if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
+                Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+            }
+           else {
                 UV low = (p[0] << 8) + p[1];
-               p += 2;
-               if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
+               if (   UNLIKELY(low < FIRST_LOW_SURROGATE)
+                    || UNLIKELY(low > LAST_LOW_SURROGATE))
+                {
                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+                }
+               p += 2;
                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
                                         + (low - FIRST_LOW_SURROGATE) + 0x10000;
             }
-       } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
-           Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
         }
  #ifdef EBCDIC
          d = uvoffuni_to_utf8_flags(d, uv, 0);
@@ -1914,11 +1904,12 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
             s += UTF8SKIP(s);
         }
  
-       /* Here, no characters crossed, result is ok as-is */
+        /* Here, no characters crossed, result is ok as-is, but we warn. */
+        _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
         return result;
      }
  
-bad_crossing:
+  bad_crossing:
  
      /* Failed, have to return the original */
      original = valid_utf8_to_uvchr(p, lenp);
@@ -2416,6 +2407,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         PUSHSTACKi(PERLSI_MAGIC);
         ENTER;
         SAVEHINTS();
+       save_re_context();
         /* We might get here via a subroutine signature which uses a utf8
          * parameter name, at which point PL_subname will have been set
          * but not yet used. */
@@ -2430,6 +2422,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
  #ifndef NO_TAINT_SUPPORT
             /* It is assumed that callers of this routine are not passing in
              * any user derived data.  */
+           /* Need to do this after save_re_context() as it will set
+            * PL_tainted to 1 while saving $1 etc (see the code after getrx:
+            * in Perl_magic_get).  Even the line to create errsv_save can turn on
+            * PL_tainted.  */
             SAVEBOOL(TAINT_get);
             TAINT_NOT;
  #endif
@@ -3558,7 +3554,9 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
  
          /* The first number is a count of the rest */
          l++;
-        elements = grok_atou((const char *)l, &after_atou);
+        if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
+            Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
+        }
          if (elements == 0) {
              invlist = _new_invlist(0);
          }
@@ -3568,7 +3566,9 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
  
              /* Get the 0th element, which is needed to setup the inversion list */
              while (isSPACE(*l)) l++;
-            element0 = (UV) grok_atou((const char *)l, &after_atou);
+            if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
+                Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
+            }
              l = (U8 *) after_atou;
              invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
              elements--;
@@ -3579,7 +3579,9 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
                      Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
                  }
                  while (isSPACE(*l)) l++;
-                *other_elements_ptr++ = (UV) grok_atou((const char *)l, &after_atou);
+                if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
+                    Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
+                }
                  l = (U8 *) after_atou;
              }
          }
@@ -3798,6 +3800,8 @@ UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
  
  The pointer to the PV of the C<dsv> is returned.
  
+See also L</sv_uni_display>.
+
  =cut */
  char *
  Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
@@ -3941,7 +3945,15 @@ L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
   *                          routine.  This allows that step to be skipped.
   *                          Currently, this requires s1 to be encoded as UTF-8
   *                          (u1 must be true), which is asserted for.
+ *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
+ *                          cross certain boundaries.  Hence, the caller should
+ *                          let this function do the folding instead of
+ *                          pre-folding.  This code contains an assertion to
+ *                          that effect.  However, if the caller knows what
+ *                          it's doing, it can pass this flag to indicate that,
+ *                          and the assertion is skipped.
   *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
+ *  FOLDEQ_S2_FOLDS_SANE    Similarly.
   */
  I32
  Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
@@ -3962,7 +3974,10 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
      assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
-           && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+               && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
+                     && !(flags & FOLDEQ_S1_FOLDS_SANE))
+                   || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
+                       && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
      /* The algorithm is to trial the folds without regard to the flags on
       * the first line of the above assert(), and then see if the result
       * violates them.  This means that the inputs can't be pre-folded to a
@@ -4179,7 +4194,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  =for apidoc uvuni_to_utf8_flags
  
  Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+L</uvchr_to_utf8_flags>.
  
  This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
  which itself, while not deprecated, should be used only in isolated
@@ -4200,11 +4215,5 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  }
  
  /*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
   * ex: set ts=8 sts=4 sw=4 et:
   */