This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Resolve EBCDIC inconsistency towards simpler
author: Karl Williamson <public@khwilliamson.com>
Sun, 27 May 2012 04:19:22 +0000 (22:19 -0600)
committer: Karl Williamson <public@khwilliamson.com>
Thu, 2 Aug 2012 15:24:51 +0000 (09:24 -0600)
This code has assumed that to_uni_fold() returns its folds in Unicode
(i.e.  Latin1) rather than native EBCDIC.  Other code in the core
assumes the opposite.  One has to change.  I'm changing this one, as the
issues should be dealt with at the lowest level possible, which is in
to_uni_fold().  Since we don't currently have an EBCDIC platform to test
on, making sure that it all hangs together will have to be deferred
until such time as we do.

By doing this we make this code simpler and faster.  The fold has
already been calculated; we just need to copy it to the final place
(done in pass2).

regcomp.c

index 807bc71..d09d89f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10676,29 +10676,13 @@ tryagain:
                    }
                }
                 if (UTF || is_exactfu_sharp_s) {
                    }
                }
                 if (UTF || is_exactfu_sharp_s) {
-                    if (FOLD) {
-                         /* Emit all the Unicode characters. */
-                         STRLEN numlen;
-                         for (foldbuf = tmpbuf;
-                              foldlen;
-                              foldlen -= numlen) {
-
-                               /* tmpbuf has been constructed by us, so we know
-                                * it is valid utf8 */
-                              ender = valid_utf8_to_uvchr(foldbuf, &numlen);
-                              if (numlen > 0) {
-                                   const STRLEN unilen = reguni(pRExC_state, ender, s);
-                                   len     += unilen;
-                                   s       += unilen;
-                                   /* In EBCDIC the numlen
-                                    * and unilen can differ. */
-                                   foldbuf += numlen;
-                                   if (numlen >= foldlen)
-                                        break; /* "Can't happen." */
-                              }
-                              else
-                                   break;
-                         }
+                    if (FOLD) {
+                        if (! SIZE_ONLY) {
+                            /* Emit all the Unicode characters. */
+                            Copy(tmpbuf, s, foldlen, char);
+                        }
+                        len += foldlen;
+                        s += foldlen;
                     }
                     else {
                          const STRLEN unilen = reguni(pRExC_state, ender, s);
                     }
                     else {
                          const STRLEN unilen = reguni(pRExC_state, ender, s);