This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update CPAN-Meta to CPAN version 2.130880
[perl5.git] / regcomp.c
index 53268fd..7293a57 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1456,7 +1456,7 @@ is the recommended Unicode-aware way of saying
            len = 0;                                                                     \
         } else {                                                                        \
             len = 1;                                                                    \
-            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
+            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);       \
             skiplen = UNISKIP(uvc);                                                     \
             foldlen -= skiplen;                                                         \
             scan = foldbuf + skiplen;                                                   \
@@ -2686,29 +2686,37 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
  *      described in the next item.
- * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
- *      'ss' or not is not knowable at compile time.  It will match iff the
- *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
- *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
- *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
- *      described in item 3).  An assumption that the optimizer part of
- *      regexec.c (probably unwittingly) makes is that a character in the
- *      pattern corresponds to at most a single character in the target string.
- *      (And I do mean character, and not byte here, unlike other parts of the
- *      documentation that have never been updated to account for multibyte
- *      Unicode.)  This assumption is wrong only in this case, as all other
- *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
- *      virtue of having this file pre-fold UTF-8 patterns.   I'm
- *      reluctant to try to change this assumption, so instead the code punts.
- *      This routine examines EXACTF nodes for the sharp s, and returns a
- *      boolean indicating whether or not the node is an EXACTF node that
- *      contains a sharp s.  When it is true, the caller sets a flag that later
- *      causes the optimizer in this file to not set values for the floating
- *      and fixed string lengths, and thus avoids the optimizer code in
- *      regexec.c that makes the invalid assumption.  Thus, there is no
- *      optimization based on string lengths for EXACTF nodes that contain the
- *      sharp s.  This only happens for /id rules (which means the pattern
- *      isn't in UTF-8).
+ * 4)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
+ *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
+ *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
+ *      character in the pattern corresponds to at most a single character in
+ *      the target string.  (And I do mean character, and not byte here, unlike
+ *      other parts of the documentation that have never been updated to
+ *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
+ *      two character string 'ss'; in EXACTFA nodes it can match
+ *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
+ *      instances where it is violated.  I'm reluctant to try to change the
+ *      assumption, as the code involved is impenetrable to me (khw), so
+ *      instead the code here punts.  This routine examines (when the pattern
+ *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
+ *      boolean indicating whether or not the node contains a sharp s.  When it
+ *      is true, the caller sets a flag that later causes the optimizer in this
+ *      file to not set values for the floating and fixed string lengths, and
+ *      thus avoids the optimizer code in regexec.c that makes the invalid
+ *      assumption.  Thus, there is no optimization based on string lengths for
+ *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
+ *      (The reason the assumption is wrong only in these two cases is that all
+ *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
+ *      other folds to their expanded versions.  We can't prefold sharp s to
+ *      'ss' in EXACTF nodes because we don't know at compile time if it
+ *      actually matches 'ss' or not.  It will match iff the target string is
+ *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
+ *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
+ *      pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the problem;
+ *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
+ *      require the pattern to be forced into UTF-8, the overhead of which we
+ *      want to avoid.)
  */
 
 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
@@ -2838,7 +2846,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                     OP(scan) = EXACTFU_SS;
                     s += 2;
                 }
-                else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
+                else if (len == 6   /* len is the same in both ASCII and EBCDIC
+                                       for these */
                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
                                       COMBINING_DIAERESIS_UTF8
                                       COMBINING_ACUTE_ACCENT_UTF8,
@@ -2898,13 +2907,30 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
             next_iteration: ;
            }
        }
-       else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+       else if (OP(scan) == EXACTFA) {
 
-            /* Here, the pattern is not UTF-8.  Look for the multi-char folds
-             * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
-             * nodes can't have multi-char folds to this range (and there are
-             * no existing ones in the upper latin1 range).  In the EXACTF
-             * case we look also for the sharp s, which can be in the final
+            /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
+             * fold to the ASCII range (and there are no existing ones in the
+             * upper latin1 range).  But, as outlined in the comments preceding
+             * this function, we need to flag any occurrences of the sharp s */
+           while (s < s_end) {
+                if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+                    *has_exactf_sharp_s = TRUE;
+                    break;
+                }
+                s++;
+                continue;
+            }
+        }
+       else if (OP(scan) != EXACTFL) {
+
+            /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
+             * multi-char folds that are all Latin1.  (This code knows that
+             * there are no current multi-char folds possible with EXACTFL,
+             * relying on fold_grind.t to catch any errors in the very unlikely
+             * event that some get added in future Unicode versions.)
+             * As explained in the comments preceding this function, we look
+             * also for the sharp s in EXACTF nodes; it can be in the final
              * position.  Otherwise we can stop looking 1 byte earlier because
              * have to find at least two characters for a multi-fold */
            const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
@@ -8747,8 +8773,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            }
            nextchar(pRExC_state);
            return ret;
-        } else 
-       if (*RExC_parse == '?') { /* (?...) */
+        }
+        else if (*RExC_parse == '?') { /* (?...) */
            bool is_logical = 0;
            const char * const seqstart = RExC_parse;
             if (has_intervening_patws && SIZE_ONLY) {
@@ -10647,8 +10673,14 @@ tryagain:
                        goto parse_named_seq;
                }   }
                num = atoi(RExC_parse);
-               if (isg && num == 0)
-                   vFAIL("Reference to invalid group 0");
+               if (isg && num == 0) {
+                   if (*RExC_parse == '0') {
+                        vFAIL("Reference to invalid group 0");
+                    }
+                    else {
+                       vFAIL("Unterminated \\g... pattern");
+                    }
+                }
                 if (isrel) {
                     num = RExC_npar - num;
                     if (num < 1)
@@ -10661,8 +10693,6 @@ tryagain:
                    char * const parse_start = RExC_parse - 1; /* MJD */
                    while (isDIGIT(*RExC_parse))
                        RExC_parse++;
-                   if (parse_start == RExC_parse - 1) 
-                       vFAIL("Unterminated \\g... pattern");
                     if (hasbrace) {
                         if (*RExC_parse != '}') 
                             vFAIL("Unterminated \\g{...} pattern");
@@ -11028,98 +11058,100 @@ tryagain:
                     goto loopdone;
                 }
 
-               if (FOLD) {
-                    if (UTF
-                            /* See comments for join_exact() as to why we fold
-                             * this non-UTF at compile time */
-                        || (node_type == EXACTFU
-                            && ender == LATIN_SMALL_LETTER_SHARP_S))
-                    {
-
-
-                        /* Prime the casefolded buffer.  Locale rules, which
-                         * apply only to code points < 256, aren't known until
-                         * execution, so for them, just output the original
-                         * character using utf8.  If we start to fold non-UTF
-                         * patterns, be sure to update join_exact() */
-                        if (LOC && ender < 256) {
-                            if (UNI_IS_INVARIANT(ender)) {
-                                *s = (U8) ender;
-                                foldlen = 1;
-                            } else {
-                                *s = UTF8_TWO_BYTE_HI(ender);
-                                *(s + 1) = UTF8_TWO_BYTE_LO(ender);
-                                foldlen = 2;
-                            }
+                if (! FOLD) {
+                    if (UTF) {
+                        const STRLEN unilen = reguni(pRExC_state, ender, s);
+                        if (unilen > 0) {
+                           s   += unilen;
+                           len += unilen;
                         }
-                        else {
-                            UV folded = _to_uni_fold_flags(
-                                           ender,
-                                           (U8 *) s,
-                                           &foldlen,
-                                           FOLD_FLAGS_FULL
-                                           | ((LOC) ?  FOLD_FLAGS_LOCALE
-                                                    : (ASCII_FOLD_RESTRICTED)
-                                                      ? FOLD_FLAGS_NOMIX_ASCII
-                                                      : 0)
-                                            );
 
-                            /* If this node only contains non-folding code
-                             * points so far, see if this new one is also
-                             * non-folding */
-                            if (maybe_exact) {
-                                if (folded != ender) {
-                                    maybe_exact = FALSE;
+                        /* The loop increments <len> each time, as all but this
+                         * path (and one other) through it add a single byte to
+                         * the EXACTish node.  But this one has changed len to
+                         * be the correct final value, so subtract one to
+                         * cancel out the increment that follows */
+                        len--;
+                    }
+                    else {
+                        REGC((char)ender, s++);
+                    }
+                }
+                else /* FOLD */
+                     if (! ( UTF
+                        /* See comments for join_exact() as to why we fold this
+                         * non-UTF at compile time */
+                        || (node_type == EXACTFU
+                            && ender == LATIN_SMALL_LETTER_SHARP_S)))
+                {
+                    *(s++) = (char) ender;
+                    maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+                }
+                else {  /* UTF, or non-UTF EXACTFU sharp s */
+
+                    /* Prime the casefolded buffer.  Locale rules, which apply
+                     * only to code points < 256, aren't known until execution,
+                     * so for them, just output the original character using
+                     * utf8.  If we start to fold non-UTF patterns, be sure to
+                     * update join_exact() */
+                    if (LOC && ender < 256) {
+                        if (UNI_IS_INVARIANT(ender)) {
+                            *s = (U8) ender;
+                            foldlen = 1;
+                        } else {
+                            *s = UTF8_TWO_BYTE_HI(ender);
+                            *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+                            foldlen = 2;
+                        }
+                    }
+                    else {
+                        UV folded = _to_uni_fold_flags(
+                                       ender,
+                                       (U8 *) s,
+                                       &foldlen,
+                                       FOLD_FLAGS_FULL
+                                       | ((LOC) ?  FOLD_FLAGS_LOCALE
+                                                : (ASCII_FOLD_RESTRICTED)
+                                                  ? FOLD_FLAGS_NOMIX_ASCII
+                                                  : 0)
+                                        );
+
+                        /* If this node only contains non-folding code points
+                         * so far, see if this new one is also non-folding */
+                        if (maybe_exact) {
+                            if (folded != ender) {
+                                maybe_exact = FALSE;
+                            }
+                            else {
+                                /* Here the fold is the original; we have
+                                 * to check further to see if anything
+                                 * folds to it */
+                                if (! PL_utf8_foldable) {
+                                    SV* swash = swash_init("utf8",
+                                                       "_Perl_Any_Folds",
+                                                       &PL_sv_undef, 1, 0);
+                                    PL_utf8_foldable =
+                                                _get_swash_invlist(swash);
+                                    SvREFCNT_dec_NN(swash);
                                 }
-                                else {
-                                    /* Here the fold is the original; we have
-                                     * to check further to see if anything
-                                     * folds to it */
-                                    if (! PL_utf8_foldable) {
-                                        SV* swash = swash_init("utf8",
-                                                           "_Perl_Any_Folds",
-                                                           &PL_sv_undef, 1, 0);
-                                        PL_utf8_foldable =
-                                                    _get_swash_invlist(swash);
-                                        SvREFCNT_dec_NN(swash);
-                                    }
-                                    if (_invlist_contains_cp(PL_utf8_foldable,
-                                                             ender))
-                                    {
-                                        maybe_exact = FALSE;
-                                    }
+                                if (_invlist_contains_cp(PL_utf8_foldable,
+                                                         ender))
+                                {
+                                    maybe_exact = FALSE;
                                 }
                             }
-                            ender = folded;
                         }
-                       s += foldlen;
-
-                       /* The loop increments <len> each time, as all but this
-                        * path (and the one just below for UTF) through it add
-                        * a single byte to the EXACTish node.  But this one
-                        * has changed len to be the correct final value, so
-                        * subtract one to cancel out the increment that
-                        * follows */
-                       len += foldlen - 1;
-                    }
-                    else {
-                        *(s++) = (char) ender;
-                        maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+                        ender = folded;
                     }
+                    s += foldlen;
+
+                    /* The loop increments <len> each time, as all but this
+                     * path (and one other) through it add a single byte to the
+                     * EXACTish node.  But this one has changed len to be the
+                     * correct final value, so subtract one to cancel out the
+                     * increment that follows */
+                    len += foldlen - 1;
                }
-               else if (UTF) {
-                    const STRLEN unilen = reguni(pRExC_state, ender, s);
-                    if (unilen > 0) {
-                       s   += unilen;
-                       len += unilen;
-                    }
-
-                   /* See comment just above for - 1 */
-                   len--;
-               }
-               else {
-                   REGC((char)ender, s++);
-                }
 
                if (next_is_quantifier) {
 
@@ -13026,17 +13058,17 @@ parseit:
                         /* <multi_char_matches> is actually an array of arrays.
                          * There will be one or two top-level elements: [2],
                          * and/or [3].  The [2] element is an array, each
-                         * element thereof is a character which folds to two
-                         * characters; likewise for [3].  (Unicode guarantees a
-                         * maximum of 3 characters in any fold.)  When we
-                         * rewrite the character class below, we will do so
-                         * such that the longest folds are written first, so
-                         * that it prefers the longest matching strings first.
-                         * This is done even if it turns out that any
-                         * quantifier is non-greedy, out of programmer
-                         * laziness.  Tom Christiansen has agreed that this is
-                         * ok.  This makes the test for the ligature 'ffi' come
-                         * before the test for 'ff' */
+                         * element thereof is a character which folds to TWO
+                         * characters; [3] is for folds to THREE characters.
+                         * (Unicode guarantees a maximum of 3 characters in any
+                         * fold.)  When we rewrite the character class below,
+                         * we will do so such that the longest folds are
+                         * written first, so that it prefers the longest
+                         * matching strings first.  This is done even if it
+                         * turns out that any quantifier is non-greedy, out of
+                         * programmer laziness.  Tom Christiansen has agreed
+                         * that this is ok.  This makes the test for the
+                         * ligature 'ffi' come before the test for 'ff' */
                         if (av_exists(multi_char_matches, cp_count)) {
                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
                                                              cp_count, FALSE);
@@ -13540,8 +13572,7 @@ parseit:
                          * doesn't allow them between above and below 256 */
                         if ((ASCII_FOLD_RESTRICTED
                                   && (isASCII(c) != isASCII(j)))
-                            || (LOC && ((c < 256) != (j < 256))))
-                        {
+                            || (LOC && c < 256)) {
                             continue;
                         }