(perl #108276) add wrappers for deferred op processing

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index cba1523..fc4a0c1 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -42,8 +42,6 @@ static const char cp_above_legal_max[] =
                          "Use of code point 0x%" UVXf " is not allowed; the"
                          " permissible max is 0x%" UVXf;
  
-#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
-
  /*
  =head1 Unicode Support
  These are various utility functions for manipulating UTF8-encoded
@@ -309,8 +307,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
-        if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
-            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+        if (UNLIKELY(uv > MAX_LEGAL_CP)) {
+            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
          }
          if (       (flags & UNICODE_WARN_SUPER)
              || (   (flags & UNICODE_WARN_PERL_EXTENDED)
@@ -2649,7 +2647,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
      PERL_ARGS_ASSERT_BYTES_TO_UTF8;
      PERL_UNUSED_CONTEXT;
  
-    Newx(d, (*lenp) * 2 + 1, U8);
+    /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
      dst = d;
  
      while (s < send) {
@@ -2660,9 +2659,6 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
      *d = '\0';
      *lenp = d-dst;
  
-    /* Trim unused space */
-    Renew(dst, *lenp + 1, U8);
-
      return dst;
  }
  
@@ -3188,14 +3184,14 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name,
  
              if (instr(file, "mathoms.c")) {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
-                            "In %s, line %d, starting in Perl v5.30, %s()"
+                            "In %s, line %d, starting in Perl v5.32, %s()"
                              " will be removed.  Avoid this message by"
                              " converting to use %s().\n",
                              file, line, name, alternative);
              }
              else {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
-                            "In %s, line %d, starting in Perl v5.30, %s() will"
+                            "In %s, line %d, starting in Perl v5.32, %s() will"
                              " require an additional parameter.  Avoid this"
                              " message by converting to use %s().\n",
                              file, line, name, alternative);
@@ -3428,9 +3424,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
-                    if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
+                    if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
                          Perl_croak(aTHX_ cp_above_legal_max, uv1,
-                                         MAX_EXTERNALLY_LEGAL_CP);
+                                         MAX_LEGAL_CP);
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
@@ -3952,7 +3948,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
  
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
-            if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
+            if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3962,7 +3958,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
              }
              else
  #endif
-                 if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
+                 if (memBEGINs((char *) p, e - p, LONG_S_T))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3981,7 +3977,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
               * 255/256 boundary which is forbidden under /l, and so the code
               * wouldn't catch that they are equivalent (which they are only in
               * this release) */
-            else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
+            else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
                                "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
@@ -4063,7 +4059,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
       * works. */
  
      *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
-    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
          ustrp, *lenp, U8);
      return LATIN_SMALL_LETTER_LONG_S;
  
@@ -5475,7 +5471,13 @@ L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
   *                          that effect.  However, if the caller knows what
   *                          it's doing, it can pass this flag to indicate that,
   *                          and the assertion is skipped.
- *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
+ *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ *                          to s2, and s2 doesn't have to be UTF-8 encoded.
+ *                          This introduces an asymmetry to save a few branches
+ *                          in a loop.  Currently, this is not a problem, as
+ *                          never are both inputs pre-folded.  Simply call this
+ *                          function with the pre-folded one as the second
+ *                          string.
   *  FOLDEQ_S2_FOLDS_SANE
   */
  I32
@@ -5498,11 +5500,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
-    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
-               && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
-                     && !(flags & FOLDEQ_S1_FOLDS_SANE))
-                   || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
-                       && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+    assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+               && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
+                        && !(flags &  FOLDEQ_S1_FOLDS_SANE))
+                    || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
+                        && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
      /* The algorithm is to trial the folds without regard to the flags on
       * the first line of the above assert(), and then see if the result
       * violates them.  This means that the inputs can't be pre-folded to a
@@ -5522,6 +5524,9 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
              flags_for_folder |= FOLD_FLAGS_LOCALE;
          }
      }
+    if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+        flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+    }
  
      if (pe1) {
          e1 = *(U8**)pe1;
@@ -5606,9 +5611,23 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
  
          if (n2 == 0) {    /* Same for s2 */
             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
-               f2 = (U8 *) p2;
-                assert(u2);
-               n2 = UTF8SKIP(f2);
+
+                /* Point to the already-folded character.  But for non-UTF-8
+                 * variants, convert to UTF-8 for the algorithm below */
+               if (UTF8_IS_INVARIANT(*p2)) {
+                    f2 = (U8 *) p2;
+                    n2 = 1;
+                }
+                else if (u2) {
+                    f2 = (U8 *) p2;
+                    n2 = UTF8SKIP(f2);
+                }
+                else {
+                    foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+                    foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+                    f2 = foldbuf2;
+                    n2 = 2;
+                }
             }
             else {
                  if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {