Note in perlfaq2 that www.perl.com is no longer part of O'Reilly

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 8fd5db9..019d49f 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -42,7 +42,7 @@ PERL_CALLCONV U8*     Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
  
-/* 
+/*
  =head1 Unicode Support
  
  This file contains various utility functions for manipulating UTF8-encoded
@@ -264,7 +264,7 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
         if (!UTF8_IS_CONTINUATION(*s))
             return 0;
         uv = UTF8_ACCUMULATE(uv, *s);
-       if (uv < ouv) 
+       if (uv < ouv)
             return 0;
         ouv = uv;
         s++;
@@ -805,6 +805,75 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
  }
  
  /*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if the was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+    const U8 *const bend = b + blen;
+    const U8 *const uend = u + ulen;
+
+    PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+    PERL_UNUSED_CONTEXT;
+
+    while (b < bend && u < uend) {
+        U8 c = *u++;
+       if (!UTF8_IS_INVARIANT(c)) {
+           if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+               if (u < uend) {
+                   U8 c1 = *u++;
+                   if (UTF8_IS_CONTINUATION(c1)) {
+                       c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1);
+                       c = ASCII_TO_NATIVE(c);
+                   } else {
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                        "Malformed UTF-8 character "
+                                        "(unexpected non-continuation byte 0x%02x"
+                                        ", immediately after start byte 0x%02x)"
+                                        /* Dear diag.t, it's in the pod.  */
+                                        "%s%s", c1, c,
+                                        PL_op ? " in " : "",
+                                        PL_op ? OP_DESC(PL_op) : "");
+                       return -2;
+                   }
+               } else {
+                   if (PL_op)
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                        "%s in %s", unees, OP_DESC(PL_op));
+                   else
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+                   return -2; /* Really want to return undef :-)  */
+               }
+           } else {
+               return -2;
+           }
+       }
+       if (*b != c) {
+           return *b < c ? -2 : +2;
+       }
+       ++b;
+    }
+
+    if (b == bend && u == uend)
+       return 0;
+
+    return b < bend ? +1 : -1;
+}
+
+/*
  =for apidoc utf8_to_bytes
  
  Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
@@ -2022,6 +2091,105 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      NORETURN_FUNCTION_END;
  }
  
+/* Read a single line of the main body of the swash input text.  These are of
+ * the form:
+ * 0053        0056    0073
+ * where each number is hex.  The first two numbers form the minimum and
+ * maximum of a range, and the third is the value associated with the range.
+ * Not all swashes should have a third number
+ *
+ * On input: l   points to the beginning of the line to be examined; it points
+ *               to somewhere in the string of the whole input text, and is
+ *               terminated by a \n or the null string terminator.
+ *          lend   points to the null terminator of that string
+ *          wants_value    is non-zero if the swash expects a third number
+ *          typestr is the name of the swash's mapping, like 'ToLower'
+ * On output: *min, *max, and *val are set to the values read from the line.
+ *           returns a pointer just beyond the line examined.  If there was no
+ *           valid min number on the line, returns lend+1
+ */
+
+STATIC U8*
+S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
+                            const bool wants_value, const U8* const typestr)
+{
+    const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
+    STRLEN numlen;         /* Length of the number */
+    I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+    /* nl points to the next \n in the scan */
+    U8* const nl = (U8*)memchr(l, '\n', lend - l);
+
+    /* Get the first number on the line: the range minimum */
+    numlen = lend - l;
+    *min = grok_hex((char *)l, &numlen, &flags, NULL);
+    if (numlen)            /* If found a hex number, position past it */
+       l += numlen;
+    else if (nl) {         /* Else, go handle next line, if any */
+       return nl + 1;  /* 1 is length of "\n" */
+    }
+    else {             /* Else, no next line */
+       return lend + 1;        /* to LIST's end at which \n is not found */
+    }
+
+    /* The max range value follows, separated by a BLANK */
+    if (isBLANK(*l)) {
+       ++l;
+       flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+       numlen = lend - l;
+       *max = grok_hex((char *)l, &numlen, &flags, NULL);
+       if (numlen)
+           l += numlen;
+       else    /* If no value here, it is a single element range */
+           *max = *min;
+
+       /* Non-binary tables have a third entry: what the first element of the
+        * range maps to */
+       if (wants_value) {
+           if (isBLANK(*l)) {
+               ++l;
+               flags = PERL_SCAN_SILENT_ILLDIGIT |
+                       PERL_SCAN_DISALLOW_PREFIX;
+               numlen = lend - l;
+               *val = grok_hex((char *)l, &numlen, &flags, NULL);
+               if (numlen)
+                   l += numlen;
+               else
+                   *val = 0;
+           }
+           else {
+               *val = 0;
+               if (typeto) {
+                   Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                    typestr, l);
+               }
+           }
+       }
+       else
+           *val = 0; /* bits == 1, then any val should be ignored */
+    }
+    else { /* Nothing following range min, should be single element with no
+             mapping expected */
+       *max = *min;
+       if (wants_value) {
+           *val = 0;
+           if (typeto) {
+               Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+           }
+       }
+       else
+           *val = 0; /* bits == 1, then val should be ignored */
+    }
+
+    /* Position to next line if any, or EOF */
+    if (nl)
+       l = nl + 1;
+    else
+       l = lend;
+
+    return l;
+}
+
  /* Note:
   * Returns a swatch (a bit vector string) for a code point sequence
   * that starts from the value C<start> and comprises the number C<span>.
@@ -2035,13 +2203,15 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      U8 *l, *lend, *x, *xend, *s;
      STRLEN lcur, xcur, scur;
      HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
      SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
      SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
      SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
      SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
      SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
      const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
-    const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
      const STRLEN bits  = SvUV(*bitssvp);
      const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
      const UV     none  = SvUV(*nonesvp);
@@ -2088,74 +2258,13 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      lend = l + lcur;
      while (l < lend) {
         UV min, max, val;
-       STRLEN numlen;
-       I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-
-       U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
-       numlen = lend - l;
-       min = grok_hex((char *)l, &numlen, &flags, NULL);
-       if (numlen)
-           l += numlen;
-       else if (nl) {
-           l = nl + 1; /* 1 is length of "\n" */
-           continue;
-       }
-       else {
-           l = lend; /* to LIST's end at which \n is not found */
+       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                        cBOOL(octets), typestr);
+       if (l > lend) {
             break;
         }
  
-       if (isBLANK(*l)) {
-           ++l;
-           flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-           numlen = lend - l;
-           max = grok_hex((char *)l, &numlen, &flags, NULL);
-           if (numlen)
-               l += numlen;
-           else
-               max = min;
-
-           if (octets) {
-               if (isBLANK(*l)) {
-                   ++l;
-                   flags = PERL_SCAN_SILENT_ILLDIGIT |
-                           PERL_SCAN_DISALLOW_PREFIX;
-                   numlen = lend - l;
-                   val = grok_hex((char *)l, &numlen, &flags, NULL);
-                   if (numlen)
-                       l += numlen;
-                   else
-                       val = 0;
-               }
-               else {
-                   val = 0;
-                   if (typeto) {
-                       Perl_croak(aTHX_ "%s: illegal mapping '%s'",
-                                        typestr, l);
-                   }
-               }
-           }
-           else
-               val = 0; /* bits == 1, then val should be ignored */
-       }
-       else {
-           max = min;
-           if (octets) {
-               val = 0;
-               if (typeto) {
-                   Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
-               }
-           }
-           else
-               val = 0; /* bits == 1, then val should be ignored */
-       }
-
-       if (nl)
-           l = nl + 1;
-       else
-           l = lend;
-
+       /* If looking for something beyond this range, go try the next one */
         if (max < start)
             continue;
  
@@ -2337,6 +2446,138 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      return swatch;
  }
  
+HV*
+Perl__swash_inversion_hash(pTHX_ SV* swash)
+{
+
+   /* Subject to change or removal.  For use only in one place in regexec.c
+    *
+    * Returns a hash which is the inversion and closure of a swash mapping.
+    * For example, consider the input lines:
+    * 004B             006B
+    * 004C             006C
+    * 212A             006B
+    *
+    * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+    * 006C.  The value for each key is an array.  For 006C, the array would
+    * have a two elements, the utf8 for itself, and for 004C.  For 006B, there
+    * would be three elements in its array, the utf8 for 006B, 004B and 212A.
+    *
+    * Essentially, for any code point, it gives all the code points that map to
+    * it, or the list of 'froms' for that point.
+    *
+    * Currently it only looks at the main body of the swash, and ignores any
+    * additions or deletions from other swashes */
+
+    U8 *l, *lend;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    const UV     none  = SvUV(*nonesvp);
+
+    HV* ret = newHV();
+
+    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
+
+    /* Must have at least 8 bits to get the mappings */
+    if (bits != 8 && bits != 16 && bits != 32) {
+       Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
+                                                (UV)bits);
+    }
+
+    /* read $swash->{LIST} */
+    l = (U8*)SvPV(*listsvp, lcur);
+    lend = l + lcur;
+
+    /* Go through each input line */
+    while (l < lend) {
+       UV min, max, val;
+       UV inverse;
+       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                        cBOOL(octets), typestr);
+       if (l > lend) {
+           break;
+       }
+
+       /* Each element in the range is to be inverted */
+       for (inverse = min; inverse <= max; inverse++) {
+           AV* list;
+           SV* element;
+           SV** listp;
+           IV i;
+           bool found_key = FALSE;
+
+           /* The key is the inverse mapping */
+           char key[UTF8_MAXBYTES+1];
+           char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+           STRLEN key_len = key_end - key;
+
+           /* And the value is what the forward mapping is from. */
+           char utf8_inverse[UTF8_MAXBYTES+1];
+           char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse);
+
+           /* Get the list for the map */
+           if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
+               list = (AV*) *listp;
+           }
+           else { /* No entry yet for it: create one */
+               list = newAV();
+               if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
+                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+               }
+           }
+
+           for (i = 0; i < av_len(list); i++) {
+               SV** entryp = av_fetch(list, i, FALSE);
+               SV* entry;
+               if (entryp == NULL) {
+                   Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+               }
+               entry = *entryp;
+               if (SvCUR(entry) != key_len) {
+                   continue;
+               }
+               if (memEQ(key, SvPVX(entry), key_len)) {
+                   found_key = TRUE;
+                   break;
+               }
+           }
+           if (! found_key) {
+               element = newSVpvn_flags(key, key_len, SVf_UTF8);
+               av_push(list, element);
+           }
+
+
+           /* Simply add the value to the list */
+           element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8);
+           av_push(list, element);
+
+           /* swash_get() increments the value of val for each element in the
+            * range.  That makes more compact tables possible.  You can
+            * express the capitalization, for example, of all consecutive
+            * letters with a single line: 0061\t007A\t0041 This maps 0061 to
+            * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
+            * and it's not documented, and perhaps not even currently used,
+            * but I copied the semantics from swash_get(), just in case */
+           if (!none || val < none) {
+               ++val;
+           }
+       }
+    }
+
+    return ret;
+}
+
  /*
  =for apidoc uvchr_to_utf8
  
@@ -2377,7 +2618,7 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  =for apidoc utf8n_to_uvchr
  flags
  
-Returns the native character value of the first character in the string 
+Returns the native character value of the first character in the string
  C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
@@ -2390,7 +2631,7 @@ Allows length and flags to be passed to low level routine.
     a real function in case XS code wants it
  */
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, 
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
  U32 flags)
  {
      const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
@@ -2475,7 +2716,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
      }
      if (truncated)
          sv_catpvs(dsv, "...");
-    
+
      return SvPVX(dsv);
  }
  
@@ -2502,17 +2743,18 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
  }
  
  /*
-=for apidoc ibcmp_utf8
+=for apidoc foldEQ_utf8
  
-Returns true if the strings s1 and s2 differ case-insensitively, false
-if they are equal case-insensitively.  Note that this is the complement of what
-you might expect (perhaps it would have been better to name it C<ibncmp_utf8>).
+Returns true if the leading portions of the strings s1 and s2 (either or both
+of which may be in UTF-8) are the same case-insensitively; false otherwise.
+How far into the strings to compare is determined by other input parameters.
  
  If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
  otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
  with respect to s2.
  
-If the byte length l1 is non-zero, s1+l1 will be used as a goal to reach.  The
+If the byte length l1 is non-zero, it says how far into s1 to check for fold
+equality.  In other words, s1+l1 will be used as a goal to reach.  The
  scan will not be considered to be a match unless the goal is reached, and
  scanning won't continue past that goal.  Correspondingly for l2 with respect to
  s2.
@@ -2521,14 +2763,16 @@ If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
  considered an end pointer beyond which scanning of s1 will not continue under
  any circumstances.  This means that if both l1 and pe1 are specified, and pe1
  is less than s1+l1, the match will never be successful because it can never
-get as far as its goal.  Correspondingly for pe2 with respect to s2.
+get as far as its goal (and in fact is asserted against).  Correspondingly for
+pe2 with respect to s2.
  
-At least one of s1 and s2 must have a goal, and if both do, both have to be
+At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
+non-zero), and if both do, both have to be
  reached for a successful match.   Also, if the fold of a character is multiple
  characters, all of them must be matched (see tr21 reference below for
  'folding').
  
-Upon a successful match (when the routine returns false), if pe1 is non-NULL,
+Upon a successful match, if pe1 is non-NULL,
  it will be set to point to the beginning of the I<next> character of s1 beyond
  what was matched.  Correspondingly for pe2 and s2.
  
@@ -2538,39 +2782,39 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
  
  =cut */
  I32
-Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
+Perl_foldEQ_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
  {
      dVAR;
      register const U8 *p1  = (const U8*)s1; /* Point to current char */
      register const U8 *p2  = (const U8*)s2;
-    register const U8 *g1 = NULL;      /* goal for s1 */
+    register const U8 *g1 = NULL;       /* goal for s1 */
      register const U8 *g2 = NULL;
-    register const U8 *e1 = NULL;      /* Don't scan s1 past this */
-    register U8 *f1 = NULL;            /* Point to current folded */
+    register const U8 *e1 = NULL;       /* Don't scan s1 past this */
+    register U8 *f1 = NULL;             /* Point to current folded */
      register const U8 *e2 = NULL;
      register U8 *f2 = NULL;
-    STRLEN n1 = 0, n2 = 0;             /* Number of bytes in current char */
+    STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
-    U8 natbuf[2];              /* Holds native 8-bit char converted to utf8;
-                                  these always fit in 2 bytes */
+    U8 natbuf[2];               /* Holds native 8-bit char converted to utf8;
+                                   these always fit in 2 bytes */
  
-    PERL_ARGS_ASSERT_IBCMP_UTF8;
+    PERL_ARGS_ASSERT_FOLDEQ_UTF8;
  
      if (pe1) {
-       e1 = *(U8**)pe1;
+        e1 = *(U8**)pe1;
      }
  
      if (l1) {
-       g1 = (const U8*)s1 + l1;
+        g1 = (const U8*)s1 + l1;
      }
  
      if (pe2) {
-       e2 = *(U8**)pe2;
+        e2 = *(U8**)pe2;
      }
  
      if (l2) {
-       g2 = (const U8*)s2 + l2;
+        g2 = (const U8*)s2 + l2;
      }
  
      /* Must have at least one goal */
@@ -2578,75 +2822,80 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
  
      if (g1) {
  
-       /* Will never match if goal is out-of-bounds */
-       assert(! e1  || e1 >= g1);
+        /* Will never match if goal is out-of-bounds */
+        assert(! e1  || e1 >= g1);
  
-       /* Here, there isn't an end pointer, or it is beyond the goal.  We
-       * only go as far as the goal */
-       e1 = g1;
+        /* Here, there isn't an end pointer, or it is beyond the goal.  We
+        * only go as far as the goal */
+        e1 = g1;
+    }
+    else {
+       assert(e1);    /* Must have an end for looking at s1 */
      }
-    else assert(e1);   /* Must have an end for looking at s1 */
  
      /* Same for goal for s2 */
      if (g2) {
-       assert(! e2  || e2 >= g2);
-       e2 = g2;
+        assert(! e2  || e2 >= g2);
+        e2 = g2;
+    }
+    else {
+       assert(e2);
      }
-    else assert(e2);
  
      /* Look through both strings, a character at a time */
      while (p1 < e1 && p2 < e2) {
  
-       /* If at the beginning of a new character in s1, get its fold to use */
-       if (n1 == 0) {
-           if (u1) {
-               to_utf8_fold(p1, foldbuf1, &n1);
-           }
-           else {  /* Not utf8, convert to it first and then get fold */
-               uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
-               to_utf8_fold(natbuf, foldbuf1, &n1);
-           }
-           f1 = foldbuf1;
-       }
-
-       if (n2 == 0) {    /* Same for s2 */
-           if (u2) {
-               to_utf8_fold(p2, foldbuf2, &n2);
-           }
-           else {
-               uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
-               to_utf8_fold(natbuf, foldbuf2, &n2);
-           }
-           f2 = foldbuf2;
-       }
+        /* If at the beginning of a new character in s1, get its fold to use
+         * and the length of the fold */
+        if (n1 == 0) {
+            if (u1) {
+                to_utf8_fold(p1, foldbuf1, &n1);
+            }
+            else {  /* Not utf8, convert to it first and then get fold */
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+                to_utf8_fold(natbuf, foldbuf1, &n1);
+            }
+            f1 = foldbuf1;
+        }
  
-       /* While there is more to look for in both folds, see if they
-       * continue to match */
-       while (n1 && n2) {
-           U8 fold_length = UTF8SKIP(f1);
-           if (fold_length != UTF8SKIP(f2)
-               || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
-                                                      function call for single
-                                                      character */
-               || memNE((char*)f1, (char*)f2, fold_length))
-           {
-               return 1; /* mismatch */
-           }
+        if (n2 == 0) {    /* Same for s2 */
+            if (u2) {
+                to_utf8_fold(p2, foldbuf2, &n2);
+            }
+            else {
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+                to_utf8_fold(natbuf, foldbuf2, &n2);
+            }
+            f2 = foldbuf2;
+        }
  
-           /* Here, they matched, advance past them */
-           n1 -= fold_length;
-           f1 += fold_length;
-           n2 -= fold_length;
-           f2 += fold_length;
-       }
+        /* While there is more to look for in both folds, see if they
+        * continue to match */
+        while (n1 && n2) {
+            U8 fold_length = UTF8SKIP(f1);
+            if (fold_length != UTF8SKIP(f2)
+                || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+                                                       function call for single
+                                                       character */
+                || memNE((char*)f1, (char*)f2, fold_length))
+            {
+                return 0; /* mismatch */
+            }
+
+            /* Here, they matched, advance past them */
+            n1 -= fold_length;
+            f1 += fold_length;
+            n2 -= fold_length;
+            f2 += fold_length;
+        }
  
-       /* When reach the end of any fold, advance the input past it */
-       if (n1 == 0) {
-           p1 += u1 ? UTF8SKIP(p1) : 1;
-       }
-       if (n2 == 0) {
-           p2 += u2 ? UTF8SKIP(p2) : 1;
-       }
+        /* When reach the end of any fold, advance the input past it */
+        if (n1 == 0) {
+            p1 += u1 ? UTF8SKIP(p1) : 1;
+        }
+        if (n2 == 0) {
+            p2 += u2 ? UTF8SKIP(p2) : 1;
+        }
      } /* End of loop through both strings */
  
      /* A match is defined by each scan that specified an explicit length
@@ -2654,17 +2903,17 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
      * character (which can happen when the fold of a character is more than one
      * character). */
      if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
-       return 1;
+        return 0;
      }
  
      /* Successful match.  Set output pointers */
      if (pe1) {
-       *pe1 = (char*)p1;
+        *pe1 = (char*)p1;
      }
      if (pe2) {
-       *pe2 = (char*)p2;
+        *pe2 = (char*)p2;
      }
-    return 0;
+    return 1;
  }
  
  /*