This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Correct INSTALL to warn about editing cflags, not cflags.SH
[perl5.git] / regcomp.c
index e3da6e9..b106cc1 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3325,11 +3325,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                             if ( noper_trietype
                                   &&
                                   (
-                                        ( noper_trietype == NOTHING )
-                                        ||
-                                        ( trietype == NOTHING )
-                                        ||
-                                        ( trietype == noper_trietype )
+                                        /* XXX: Currently we cannot allow a NOTHING node to be the first element
+                                         * of a TRIEABLE sequence, Otherwise we will overwrite the regop following
+                                         * the NOTHING with the TRIE regop later on. This is because a NOTHING node
+                                         * is only one regnode wide, and a TRIE is two regnodes. An example of a
+                                         * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/
+                                         * At a later point of time we can somewhat workaround this by handling
+                                         * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns,
+                                         * as we can effectively ignore the NOTHING regop in that case.
+                                         * This clause, which allows NOTHING to start a sequence is left commented
+                                         * out as a reference.
+                                         * - Yves
+
+                                           ( noper_trietype == NOTHING)
+                                           || ( trietype == NOTHING )
+                                        */
+                                        ( noper_trietype == NOTHING && trietype )
+                                        || ( trietype == noper_trietype )
                                   )
 #ifdef NOJUMPTRIE
                                   && noper_next == tail
@@ -3354,13 +3366,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 /* handle unmergable node -
                                  * noper may either be a triable node which can not be tried
                                  * together with the current trie, or a non triable node */
-                                if ( last && trietype != NOTHING ) {
-                                    /* if last is set then we have found at least two triable branch
-                                     * sequences in a row of a similar trietype so we can turn them
-                                     * into a trie */
-                                    make_trie( pRExC_state, 
-                                            startbranch, first, cur, tail, count, 
-                                            trietype, depth+1 );
+                                if ( last ) {
+                                    /* If last is set and trietype is not NOTHING then we have found
+                                     * at least two triable branch sequences in a row of a similar
+                                     * trietype so we can turn them into a trie. If/when we
+                                     * allow NOTHING to start a trie sequence this condition will be
+                                     * required, and it isn't expensive so we leave it in for now. */
+                                    if ( trietype != NOTHING )
+                                        make_trie( pRExC_state,
+                                                startbranch, first, cur, tail, count,
+                                                trietype, depth+1 );
                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
                                 }
                                 if ( noper_trietype
@@ -3480,8 +3495,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            UV uc;
            if (UTF) {
                const U8 * const s = (U8*)STRING(scan);
+               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                l = utf8_length(s, s + l);
-               uc = utf8_to_uvchr(s, NULL);
            } else {
                uc = *((U8*)STRING(scan));
            }
@@ -3575,8 +3590,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
            if (UTF) {
                const U8 * const s = (U8 *)STRING(scan);
+               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                l = utf8_length(s, s + l);
-               uc = utf8_to_uvchr(s, NULL);
            }
            else if (has_exactf_sharp_s) {
                RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
@@ -9822,7 +9837,10 @@ tryagain:
                              for (foldbuf = tmpbuf;
                                   foldlen;
                                   foldlen -= numlen) {
-                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+
+                                  /* tmpbuf has been constructed by us, so we
+                                   * know it is valid utf8 */
+                                  ender = valid_utf8_to_uvchr(foldbuf, &numlen);
                                   if (numlen > 0) {
                                        const STRLEN unilen = reguni(pRExC_state, ender, s);
                                        s       += unilen;
@@ -9858,7 +9876,7 @@ tryagain:
                          for (foldbuf = tmpbuf;
                               foldlen;
                               foldlen -= numlen) {
-                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              ender = valid_utf8_to_uvchr(foldbuf, &numlen);
                               if (numlen > 0) {
                                    const STRLEN unilen = reguni(pRExC_state, ender, s);
                                    len     += unilen;