This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Refactor as a result of [perl #123539]
author Karl Williamson <khw@cpan.org>
Tue, 6 Jan 2015 20:08:06 +0000 (13:08 -0700)
committer Karl Williamson <khw@cpan.org>
Tue, 6 Jan 2015 22:05:26 +0000 (15:05 -0700)
The previous commit was a minimal patch for this bug, suitable for a
maintenance release.  This patch refactors that fix to eliminate special
cases (hence branches) and makes things more logical and clearer.

The original code shoe-horned two cases together that, it turns out,
should have been handled separately, and this patch does that.

The earlier code was somewhat stilted to conform with the reguni() API.
Now that reguni is no longer called, this gets cleaned up.

regcomp.c

index 82f23d3..56e1a25 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -12439,53 +12439,64 @@ tryagain:
                     goto loopdone;
                 }
 
-                if (! FOLD   /* The simple case, just append the literal */
-                    || (LOC  /* Also don't fold for tricky chars under /l */
-                        && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
-                {
-                    if (UTF) {
-
-                        /* Normally, we don't need the representation of the
-                         * character in the sizing pass--just its size, but if
-                         * folding, we have to actually put the character out
-                         * even in the sizing pass, because the size could
-                         * change as we juggle things at the end of this loop
-                         * to avoid splitting a too-full node in the middle of
-                         * a potential multi-char fold [perl #123539] */
-                        const STRLEN unilen = (SIZE_ONLY && ! FOLD)
-                                               ? UNISKIP(ender)
-                                               : (uvchr_to_utf8((U8*)s, ender) - (U8*)s);
-                        if (unilen > 0) {
-                           s   += unilen;
-                           len += unilen;
-                        }
-
-                        /* The loop increments <len> each time, as all but this
-                         * path (and one other) through it add a single byte to
-                         * the EXACTish node.  But this one has changed len to
-                         * be the correct final value, so subtract one to
-                         * cancel out the increment that follows */
-                        len--;
-                    }
-                    else if (FOLD) {
-                        /* See comment above for [perl #123539] */
-                        *(s++) = (char) ender;
-                    }
-                    else {
-                        REGC((char)ender, s++);
-                    }
+                if (! FOLD) {  /* The simple case, just append the literal */
 
-                    /* Can get here if folding only if is one of the /l
-                     * characters whose fold depends on the locale.  The
-                     * occurrence of any of these indicate that we can't
-                     * simplify things */
-                    if (FOLD) {
-                        maybe_exact = FALSE;
-                        maybe_exactfu = FALSE;
+                    /* In the sizing pass, we need only the size of the
+                     * character we are appending, hence we can delay getting
+                     * its representation until PASS2. */
+                    if (SIZE_ONLY) {
+                        if (UTF) {
+                            const STRLEN unilen = UNISKIP(ender);
+                            s += unilen;
+
+                            /* We have to subtract 1 just below (and again in
+                             * the corresponding PASS2 code) because the loop
+                             * increments <len> each time, as all but this path
+                             * (and one other) through it add a single byte to
+                             * the EXACTish node.  But these paths would change
+                             * len to be the correct final value, so cancel out
+                             * the increment that follows */
+                            len += unilen - 1;
+                        }
+                        else {
+                            s++;
+                        }
+                    } else { /* PASS2 */
+                      not_fold_common:
+                        if (UTF) {
+                            U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+                            len += (char *) new_s - s - 1;
+                            s = (char *) new_s;
+                        }
+                        else {
+                            *(s++) = (char) ender;
+                        }
                     }
                 }
-                else             /* FOLD */
-                     if (! ( UTF
+                else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
+
+                    /* Here are folding under /l, and the code point is
+                     * problematic.  First, we know we can't simplify things */
+                    maybe_exact = FALSE;
+                    maybe_exactfu = FALSE;
+
+                    /* A problematic code point in this context means that its
+                     * fold isn't known until runtime, so we can't fold it now.
+                     * (The non-problematic code points are the above-Latin1
+                     * ones that fold to also all above-Latin1.  Their folds
+                     * don't vary no matter what the locale is.) But here we
+                     * have characters whose fold depends on the locale.
+                     * Unlike the non-folding case above, we have to keep track
+                     * of these in the sizing pass, so that we can make sure we
+                     * don't split too-long nodes in the middle of a potential
+                     * multi-char fold.  And unlike the regular fold case
+                     * handled in the else clauses below, we don't actually
+                     * fold and don't have special cases to consider.  What we
+                     * do for both passes is the PASS2 code for non-folding */
+                    goto not_fold_common;
+                }
+                else /* A regular FOLD code point */
+                    if (! ( UTF
                         /* See comments for join_exact() as to why we fold this
                          * non-UTF at compile time */
                         || (node_type == EXACTFU