regcomp.c: Simplify handling of EXACTFish nodes with 's' at edge
author Karl Williamson <khw@cpan.org>
Thu, 20 Dec 2018 09:09:09 +0000 (02:09 -0700)
committer Karl Williamson <khw@cpan.org>
Wed, 26 Dec 2018 19:50:37 +0000 (12:50 -0700)
Commit 8a100c918ec81926c0536594df8ee1fcccb171da created node types for
handling an 's' at the leading edge, at the trailing edge, and at both
edges of nodes compiled under /di that contain nothing else that would
prevent them from being EXACTFU nodes.  If two of these get joined, the
result could contain an 'ss' sequence, which can't be in an EXACTFU
node, because U+DF would then match it unconditionally.  Instead, under
/di, U+DF should match 'ss' if and only if the target string is UTF-8
encoded.

I realized later that having three types becomes harder to deal with
when adding yet more node types, so this commit turns the three into
just one node type, indicating that at least one edge of the node is an
's'.

It also simplifies the parsing of the pattern and determining which node
to use.

pod/perldebguts.pod
regcomp.c
regcomp.sym
regnodes.h
t/porting/known_pod_issues.dat

index 100df82..6764742 100644 (file)
@@ -681,11 +681,8 @@ will be lost.
  EXACTFU_ONLY8    str        Like EXACTFU, but only UTF-8 encoded
                              targets can match
 
- EXACTFS_B_U      str        EXACTFU but begins with [Ss]; (string not
-                             UTF-8; compile-time only).
- EXACTFS_E_U      str        EXACTFU but ends with [Ss]; (string not UTF-
-                             8; compile-time only).
- EXACTFS_BE_U     str        EXACTFU but begins and ends with [Ss];
+ EXACTFU_S_EDGE   str        /di rules, but nothing in it precludes /ui,
+                             except begins and/or ends with [Ss];
                              (string not UTF-8; compile-time only).
 
  # Do nothing types
index a907786..0243803 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4021,108 +4021,85 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
             else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
                 ;   /* join is compatible, no need to change OP */
             }
-            else if (OP(scan) == EXACTFU) {
-                if (OP(n) != EXACTFU) {
-
-                    /* Here the first node is EXACTFU and the second isn't.
-                     * Normally EXACTFU nodes are compatible for joining only
-                     * with EXACTFU_ONLY8 nodes (already handled), and other
-                     * EXACTFU nodes.  But under /di, certain temporary
-                     * EXACTFS_foo_U nodes are generated, which are compatible.
-                     * We check for this case here.  These need to be resolved
-                     * to either EXACTFU or EXACTF at joining time.  They have
-                     * nothing in them that would forbid them from being the
-                     * more desirable EXACTFU nodes except that they begin
-                     * and/or end with a single [Ss].  The reason this is
-                     * problematic is because they could be joined in this loop
-                     * with an adjacent node that ends and/or begins with [Ss]
-                     * which would then form the sequence 'ss', which matches
-                     * differently under /di than /ui, in which case EXACTFU
-                     * can't be used.  If the 'ss' sequence doesn't get formed,
-                     * the nodes get absorbed into any adjacent EXACTFU node.
-                     * And if the only adjacent node is EXACTF, they get
-                     * absorbed into that, under the theory that a longer node
-                     * is better than two shorter ones, even if one is EXACTFU.
-                     * Note that EXACTFU_ONLY8 is generated only for UTF-8
-                     * patterns, and the EXACTFS_foo_U ones only for non-UTF-8.
-                     * */
-
-                    if (OP(n) == EXACTFS_E_U || OP(n) == EXACTFS_BE_U) {
-
-                        /* Here the joined node would end with 's'.  If the
-                         * node following the combination is an EXACTF one,
-                         * it's better to join this EXACTFS_fooE_U with that
-                         * one, leaving the current one in 'scan' be the more
-                         * desirable EXACTFU */
-                        if (OP(nnext) == EXACTF) {
-                            break;
-                        }
-                        OP(scan) = EXACTFS_E_U;
-                    }
-                    else if (OP(n) != EXACTFS_B_U) {
-                        break;  /* This would be an incompatible join; stop */
-                    }
-                }
+            else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
+                ;   /* join is compatible, no need to change OP */
             }
-            else if (OP(scan) == EXACTF) {
-                if (OP(n) != EXACTF) {
-
-                    /* Here the first node is EXACTF and the second isn't.
-                     * EXACTF nodes are compatible for joining only with other
-                     * EXACTF nodes, and the EXACTFS_foo_U nodes.  But the
-                     * latter nodes can be also joined with EXACTFU ones, and
-                     * that is a better outcome, so if the node following 'n'
-                     * is EXACTFU, quit now so that those two can be joined
-                     * later */
-                    if (   OP(n) != EXACTFS_B_U
-                        && OP(n) != EXACTFS_E_U
-                        && OP(n) != EXACTFS_BE_U)
-                    {
-                        break;
-                    }
-                    else if (OP(nnext) == EXACTFU) {
+            else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
+
+                 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
+                  * which can join with EXACTFU ones.  We check for this case
+                  * here.  These need to be resolved to either EXACTFU or
+                  * EXACTF at joining time.  They have nothing in them that
+                  * would forbid them from being the more desirable EXACTFU
+                  * nodes except that they begin and/or end with a single [Ss].
+                  * The reason this is problematic is because they could be
+                  * joined in this loop with an adjacent node that ends and/or
+                  * begins with [Ss] which would then form the sequence 'ss',
+                  * which matches differently under /di than /ui, in which case
+                  * EXACTFU can't be used.  If the 'ss' sequence doesn't get
+                  * formed, the nodes get absorbed into any adjacent EXACTFU
+                  * node.  And if the only adjacent node is EXACTF, they get
+                  * absorbed into that, under the theory that a longer node is
+                  * better than two shorter ones, even if one is EXACTFU.  Note
+                  * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
+                  * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
+
+                if (STRING(n)[STR_LEN(n)-1] == 's') {
+
+                    /* Here the joined node would end with 's'.  If the node
+                     * following the combination is an EXACTF one, it's better to
+                     * join this trailing edge 's' node with that one, leaving the
+                     * current one in 'scan' be the more desirable EXACTFU */
+                    if (OP(nnext) == EXACTF) {
                         break;
                     }
-                    else {
-                        /* Here the next node can be joined with the EXACTF
-                         * node, and become part of it.  That they begin or end
-                         * with 's' now doesn't matter. */
-                    }
+
+                    OP(scan) = EXACTFU_S_EDGE;
+
+                }   /* Otherwise, the beginning 's' of the 2nd node just
+                       becomes an interior 's' in 'scan' */
+            }
+            else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
+                ;   /* join is compatible, no need to change OP */
+            }
+            else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
+
+                /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
+                 * nodes.  But the latter nodes can be also joined with EXACTFU
+                 * ones, and that is a better outcome, so if the node following
+                 * 'n' is EXACTFU, quit now so that those two can be joined
+                 * later */
+                if (OP(nnext) == EXACTFU) {
+                    break;
                 }
+
+                /* The join is compatible, and the combined node will be
+                 * EXACTF.  (These don't care if they begin or end with 's') */
             }
-            else if (OP(scan) == EXACTFS_B_U) {
-
-                /* Here, the first node begins, but does not end with 's'.
-                 * That means it doesn't form 'ss' with the following node, so
-                 * can become EXACTFU, and either stand on its own or be joined
-                 * with a following EXACTFU.  If the following is instead an
-                 * EXACTF, the two can also be joined together as EXACTF */
-                if (OP(n) == EXACTF) {
+            else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
+                if (   STRING(scan)[STR_LEN(scan)-1] == 's'
+                    && STRING(n)[0] == 's')
+                {
+                    /* When combined, we have the sequence 'ss', which means we
+                     * have to remain /di */
                     OP(scan) = EXACTF;
                 }
-                else {
-                    OP(scan) = EXACTFU;
-                    if (OP(n) != EXACTFU) {
-                        break;
-                    }
-                }
             }
-            else if (OP(scan) == EXACTFS_E_U || OP(scan) == EXACTFS_BE_U) {
-
-                /* Here, the first node ends with 's', and could become an
-                 * EXACTFU (or be joined with a following EXACTFU) if that next
-                 * node doesn't begin with 's'; otherwise it must become an
-                 * EXACTF node. */
-                if (OP(n) == EXACTFS_B_U || OP(n) == EXACTFS_BE_U) {
-                    OP(scan) = EXACTF;
+            else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
+                if (STRING(n)[0] == 's') {
+                    ;   /* Here the join is compatible and the combined node
+                           starts with 's', no need to change OP */
                 }
-                else {
+                else {  /* Now the trailing 's' is in the interior */
                     OP(scan) = EXACTFU;
-                    if (OP(n) != EXACTFU) {
-                        break;
-                    }
                 }
             }
+            else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
+
+                /* The join is compatible, and the combined node will be
+                 * EXACTF.  (These don't care if they begin or end with 's') */
+                OP(scan) = EXACTF;
+            }
             else if (OP(scan) != OP(n)) {
 
                 /* The only other compatible joinings are the same node type */
@@ -4158,12 +4135,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
 #endif
     }
 
-    /* These temporary nodes can now be turned into EXACTFU, and must, as
-     * regexec.c doesn't handle them */
-    if (   OP(scan) == EXACTFS_B_U
-        || OP(scan) == EXACTFS_E_U
-        || OP(scan) == EXACTFS_BE_U)
-    {
+    /* This temporary node can now be turned into EXACTFU, and must, as
+     * regexec.c doesn't handle it */
+    if (OP(scan) == EXACTFU_S_EDGE) {
         OP(scan) = EXACTFU;
     }
 
@@ -5295,12 +5269,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            case STAR:
                 next = NEXTOPER(scan);
 
-                /* These temporary nodes can now be turned into EXACTFU, and
-                 * must, as regexec.c doesn't handle them */
-                if (   OP(next) == EXACTFS_B_U
-                    || OP(next) == EXACTFS_E_U
-                    || OP(next) == EXACTFS_BE_U)
-                {
+                /* This temporary node can now be turned into EXACTFU, and
+                 * must, as regexec.c doesn't handle it */
+                if (OP(next) == EXACTFU_S_EDGE) {
                     OP(next) = EXACTFU;
                 }
 
@@ -13935,15 +13906,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * contain only above-Latin1 characters (hence must be in UTF8),
              * which don't participate in folds with Latin1-range characters,
              * as the latter's folds aren't known until runtime. */
-            bool maybe_exactfu = FOLD;
-
-            /* An EXACTF node that otherwise could be turned into EXACTFU,
-             * can't be if it starts and/or ends with [Ss].  Because, during
-             * optimization it could be joined with another node that ends
-             * and/or starts with [Ss], creating the sequence 'ss', which needs
-             * to remain in an EXACTF node.  This flag is used to signal this
-             * situation */
-            bool maybe_exactfs = FALSE;
+            bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
 
             /* Single-character EXACTish nodes are almost always SIMPLE.  This
              * allows us to override this as encountered */
@@ -13953,6 +13916,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * target string is (also) in UTF-8 */
             bool requires_utf8_target = FALSE;
 
+            /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
+            bool has_ss = FALSE;
+
+            /* So is the MICRO SIGN */
             bool has_micro_sign = FALSE;
 
             /* Allocate an EXACT node.  The node_type may change below to
@@ -14090,7 +14057,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             if (! maybe_exactfu) {
                                 len = 0;
                                 s = s0;
-                                maybe_exactfu = FOLD;   /* Prob. unnecessary */
                                 goto reparse;
                             }
                         }
@@ -14417,11 +14383,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                                     : 0));
                             s += added_len;
 
-                            if (ender > 255)  {
+                            if (   ender > 255
+                                && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+                            {
+                                /* U+B5 folds to the MU, so it's possible for a
+                                 * non-UTF-8 target to match it */
                                 requires_utf8_target = TRUE;
-                                if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
-                                    has_micro_sign = TRUE;
-                                }
                             }
                         }
                     }
@@ -14440,11 +14407,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* On non-ancient Unicode versions, this includes the
                          * multi-char fold SHARP S to 'ss' */
 
-                        if (len == 0 && isALPHA_FOLD_EQ(ender, 's')) {
-                            maybe_exactfs = TRUE;   /* Node begins with 's' */
-                        }
-                        else if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
+                        if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
                                  || (   isALPHA_FOLD_EQ(ender, 's')
+                                     && len > 0
                                      && isALPHA_FOLD_EQ(*(s-1), 's')))
                         {
                             /* Here, we have one of the following:
@@ -14464,9 +14429,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              *      string is in UTF-8.
                              * */
 
-                            maybe_exactfs = FALSE;  /* Can't generate an
-                                                       EXACTFS node */
-                            maybe_exactfu = FALSE;  /* Nor EXACTFU (unless we
+                            has_ss = TRUE;
+                            maybe_exactfu = FALSE;  /* Can't generate an
+                                                       EXACTFU node (unless we
                                                        already are in one) */
                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
                                 maybe_SIMPLE = 0;
@@ -14684,68 +14649,56 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     else if (requires_utf8_target) {
                         node_type = EXACT_ONLY8;
                     }
-                }
-
-                if (FOLD) {
-                    /* If the node ends in an 's' it can't now be changed into
-                     * an EXACTFU, as the node could later get joined with another
-                     * one that begins with 's' and that combination that would
-                     * then wrongly match the sharp s under /di.  (Note that if
-                     * it's already EXACTFU, this is irrelevant)  If this is
-                     * the only reason keeping it from being an EXACTFU, we
-                     * create a special node type so that at joining time, we
-                     * can turn it into an EXACTFU if no 'ss' is formed */
-                    if (isALPHA_FOLD_EQ(ender, 's')) {
-                        if (maybe_exactfu && node_type == EXACTF) {
-                            node_type = (maybe_exactfs)
-                                        ? EXACTFS_BE_U
-                                        : EXACTFS_E_U;
-                        }
-                        maybe_exactfu = FALSE;
+                } else if (FOLD) {
+                    if (    UNLIKELY(has_micro_sign || has_ss)
+                        && (node_type == EXACTFU || (   node_type == EXACTF
+                                                     && maybe_exactfu)))
+                    {   /* These two conditions are problematic in non-UTF-8
+                           EXACTFU nodes. */
+                        assert(! UTF);
+                        node_type = EXACTFUP;
                     }
+                    else if (node_type == EXACTFL) {
 
-                    /* If 'maybe_exactfu' is set, then there are no code points
-                     * that match differently depending on UTF8ness of the
-                     * target string (for /u), or depending on locale for /l */
-                    if (maybe_exactfu) {
-                        if (node_type == EXACTF) {
-                            node_type = EXACTFU;
-                        }
-                        else if (node_type == EXACTFL) {
+                        /* 'maybe_exactfu' is deliberately set above to
+                         * indicate this node type, where all code points in it
+                         * are above 255 */
+                        if (maybe_exactfu) {
                             node_type = EXACTFLU8;
                         }
                     }
-                    else if (node_type == EXACTF) {
-                        RExC_seen_d_op = TRUE;
-
-                        /* If the only thing keeping this from being EXACTFU is
-                         * that it begins with 's', change it to a special node
-                         * type so that during the later join, we can easily
-                         * check for, and do the change there if appropriate */
-                        if (maybe_exactfs) {
-                            node_type = EXACTFS_B_U;
+                    else if (node_type == EXACTF) {  /* Means is /di */
+
+                        /* If 'maybe_exactfu' is clear, then we need to stay
+                         * /di.  If it is set, it means there are no code
+                         * points that match differently depending on UTF8ness
+                         * of the target string, so it can become an EXACTFU
+                         * node */
+                        if (! maybe_exactfu) {
+                            RExC_seen_d_op = TRUE;
                         }
-                    }
-
-                    if (node_type == EXACTFU) {
-
-                        /* Because the MICRO SIGN folds to something
-                         * representable only in UTF-8, we use a special node
-                         * when we aren't in UTF-8, so can't represent that
-                         * fold */
-                        if (UNLIKELY(has_micro_sign)) {
-
-                            /* The micro sign is the only below 256 character
-                             * that folds to above 255 */
-                            if (! UTF) {
-                                node_type = EXACTFUP;
-                            }
+                        else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+                                 || isALPHA_FOLD_EQ(ender, 's'))
+                        {
+                            /* But, if the node begins or ends in an 's' we
+                             * have to defer changing it into an EXACTFU, as
+                             * the node could later get joined with another one
+                             * that ends or begins with 's' creating an 'ss'
+                             * sequence which would then wrongly match the
+                             * sharp s without the target being UTF-8.  We
+                             * create a special node that we resolve later when
+                             * we join nodes together */
+
+                            node_type = EXACTFU_S_EDGE;
                         }
-                        else if (requires_utf8_target) {
-
-                            node_type = EXACTFU_ONLY8;
+                        else {
+                            node_type = EXACTFU;
                         }
                     }
+
+                    if (requires_utf8_target && node_type == EXACTFU) {
+                        node_type = EXACTFU_ONLY8;
+                    }
                 }
 
                 OP(REGNODE_p(ret)) = node_type;
@@ -19512,9 +19465,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
                 case EXACT_ONLY8:
                 case EXACTL:
                 case EXACTF:
-                case EXACTFS_B_U:
-                case EXACTFS_E_U:
-                case EXACTFS_BE_U:
+                case EXACTFU_S_EDGE:
                 case EXACTFAA_NO_TRIE:
                 case EXACTFAA:
                 case EXACTFU:
index 8033a13..f9187c0 100644 (file)
@@ -123,11 +123,7 @@ EXACTFU_ONLY8 EXACT,    str       ; Like EXACTFU, but only UTF-8 encoded targets
 # One could add EXACTFAA8 and and something that has the same effect for /l,
 # but these would be extremely uncommon
 
-# If we ran out of node types, these could be replaced by some other method,
-# such as instead examining the first and final characters of nodes.
-EXACTFS_B_U EXACT,      str       ; EXACTFU but begins with [Ss]; (string not UTF-8; compile-time only).
-EXACTFS_E_U EXACT,      str       ; EXACTFU but ends with [Ss]; (string not UTF-8; compile-time only).
-EXACTFS_BE_U EXACT,     str       ; EXACTFU but begins and ends with [Ss]; (string not UTF-8; compile-time only).
+EXACTFU_S_EDGE EXACT,   str       ; /di rules, but nothing in it precludes /ui, except begins and/or ends with [Ss]; (string not UTF-8; compile-time only).
 
 #*Do nothing types
 
index da6a28c..d337046 100644 (file)
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            104
-#define REGMATCH_STATE_MAX     144
+#define REGNODE_MAX            102
+#define REGMATCH_STATE_MAX     142
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        EXACTFAA_NO_TRIE        44      /* 0x2c Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able). */
 #define        EXACT_ONLY8             45      /* 0x2d Like EXACT, but only UTF-8 encoded targets can match */
 #define        EXACTFU_ONLY8           46      /* 0x2e Like EXACTFU, but only UTF-8 encoded targets can match */
-#define        EXACTFS_B_U             47      /* 0x2f EXACTFU but begins with [Ss]; (string not UTF-8; compile-time only). */
-#define        EXACTFS_E_U             48      /* 0x30 EXACTFU but ends with [Ss]; (string not UTF-8; compile-time only). */
-#define        EXACTFS_BE_U            49      /* 0x31 EXACTFU but begins and ends with [Ss]; (string not UTF-8; compile-time only). */
-#define        NOTHING                 50      /* 0x32 Match empty string. */
-#define        TAIL                    51      /* 0x33 Match empty string. Can jump here from outside. */
-#define        STAR                    52      /* 0x34 Match this (simple) thing 0 or more times. */
-#define        PLUS                    53      /* 0x35 Match this (simple) thing 1 or more times. */
-#define        CURLY                   54      /* 0x36 Match this simple thing {n,m} times. */
-#define        CURLYN                  55      /* 0x37 Capture next-after-this simple thing */
-#define        CURLYM                  56      /* 0x38 Capture this medium-complex thing {n,m} times. */
-#define        CURLYX                  57      /* 0x39 Match this complex thing {n,m} times. */
-#define        WHILEM                  58      /* 0x3a Do curly processing and see if rest matches. */
-#define        OPEN                    59      /* 0x3b Mark this point in input as start of #n. */
-#define        CLOSE                   60      /* 0x3c Close corresponding OPEN of #n. */
-#define        SROPEN                  61      /* 0x3d Same as OPEN, but for script run */
-#define        SRCLOSE                 62      /* 0x3e Close preceding SROPEN */
-#define        REF                     63      /* 0x3f Match some already matched string */
-#define        REFF                    64      /* 0x40 Match already matched string, folded using native charset rules for non-utf8 */
-#define        REFFL                   65      /* 0x41 Match already matched string, folded in loc. */
-#define        REFFU                   66      /* 0x42 Match already matched string, folded using unicode rules for non-utf8 */
-#define        REFFA                   67      /* 0x43 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
-#define        NREF                    68      /* 0x44 Match some already matched string */
-#define        NREFF                   69      /* 0x45 Match already matched string, folded using native charset rules for non-utf8 */
-#define        NREFFL                  70      /* 0x46 Match already matched string, folded in loc. */
-#define        NREFFU                  71      /* 0x47 Match already matched string, folded using unicode rules for non-utf8 */
-#define        NREFFA                  72      /* 0x48 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
-#define        LONGJMP                 73      /* 0x49 Jump far away. */
-#define        BRANCHJ                 74      /* 0x4a BRANCH with long offset. */
-#define        IFMATCH                 75      /* 0x4b Succeeds if the following matches. */
-#define        UNLESSM                 76      /* 0x4c Fails if the following matches. */
-#define        SUSPEND                 77      /* 0x4d "Independent" sub-RE. */
-#define        IFTHEN                  78      /* 0x4e Switch, should be preceded by switcher. */
-#define        GROUPP                  79      /* 0x4f Whether the group matched. */
-#define        EVAL                    80      /* 0x50 Execute some Perl code. */
-#define        MINMOD                  81      /* 0x51 Next operator is not greedy. */
-#define        LOGICAL                 82      /* 0x52 Next opcode should set the flag only. */
-#define        RENUM                   83      /* 0x53 Group with independently numbered parens. */
-#define        TRIE                    84      /* 0x54 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define        TRIEC                   85      /* 0x55 Same as TRIE, but with embedded charclass data */
-#define        AHOCORASICK             86      /* 0x56 Aho Corasick stclass. flags==type */
-#define        AHOCORASICKC            87      /* 0x57 Same as AHOCORASICK, but with embedded charclass data */
-#define        GOSUB                   88      /* 0x58 recurse to paren arg1 at (signed) ofs arg2 */
-#define        NGROUPP                 89      /* 0x59 Whether the group matched. */
-#define        INSUBP                  90      /* 0x5a Whether we are in a specific recurse. */
-#define        DEFINEP                 91      /* 0x5b Never execute directly. */
-#define        ENDLIKE                 92      /* 0x5c Used only for the type field of verbs */
-#define        OPFAIL                  93      /* 0x5d Same as (?!), but with verb arg */
-#define        ACCEPT                  94      /* 0x5e Accepts the current matched string, with verbar */
-#define        VERB                    95      /* 0x5f Used only for the type field of verbs */
-#define        PRUNE                   96      /* 0x60 Pattern fails at this startpoint if no-backtracking through this */
-#define        MARKPOINT               97      /* 0x61 Push the current location for rollback by cut. */
-#define        SKIP                    98      /* 0x62 On failure skip forward (to the mark) before retrying */
-#define        COMMIT                  99      /* 0x63 Pattern fails outright if backtracking through this */
-#define        CUTGROUP                100     /* 0x64 On failure go to the next alternation in the group */
-#define        KEEPS                   101     /* 0x65 $& begins here. */
-#define        LNBREAK                 102     /* 0x66 generic newline pattern */
-#define        OPTIMIZED               103     /* 0x67 Placeholder for dump. */
-#define        PSEUDO                  104     /* 0x68 Pseudo opcode for internal use. */
+#define        EXACTFU_S_EDGE          47      /* 0x2f /di rules, but nothing in it precludes /ui, except begins and/or ends with [Ss]; (string not UTF-8; compile-time only). */
+#define        NOTHING                 48      /* 0x30 Match empty string. */
+#define        TAIL                    49      /* 0x31 Match empty string. Can jump here from outside. */
+#define        STAR                    50      /* 0x32 Match this (simple) thing 0 or more times. */
+#define        PLUS                    51      /* 0x33 Match this (simple) thing 1 or more times. */
+#define        CURLY                   52      /* 0x34 Match this simple thing {n,m} times. */
+#define        CURLYN                  53      /* 0x35 Capture next-after-this simple thing */
+#define        CURLYM                  54      /* 0x36 Capture this medium-complex thing {n,m} times. */
+#define        CURLYX                  55      /* 0x37 Match this complex thing {n,m} times. */
+#define        WHILEM                  56      /* 0x38 Do curly processing and see if rest matches. */
+#define        OPEN                    57      /* 0x39 Mark this point in input as start of #n. */
+#define        CLOSE                   58      /* 0x3a Close corresponding OPEN of #n. */
+#define        SROPEN                  59      /* 0x3b Same as OPEN, but for script run */
+#define        SRCLOSE                 60      /* 0x3c Close preceding SROPEN */
+#define        REF                     61      /* 0x3d Match some already matched string */
+#define        REFF                    62      /* 0x3e Match already matched string, folded using native charset rules for non-utf8 */
+#define        REFFL                   63      /* 0x3f Match already matched string, folded in loc. */
+#define        REFFU                   64      /* 0x40 Match already matched string, folded using unicode rules for non-utf8 */
+#define        REFFA                   65      /* 0x41 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
+#define        NREF                    66      /* 0x42 Match some already matched string */
+#define        NREFF                   67      /* 0x43 Match already matched string, folded using native charset rules for non-utf8 */
+#define        NREFFL                  68      /* 0x44 Match already matched string, folded in loc. */
+#define        NREFFU                  69      /* 0x45 Match already matched string, folded using unicode rules for non-utf8 */
+#define        NREFFA                  70      /* 0x46 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
+#define        LONGJMP                 71      /* 0x47 Jump far away. */
+#define        BRANCHJ                 72      /* 0x48 BRANCH with long offset. */
+#define        IFMATCH                 73      /* 0x49 Succeeds if the following matches. */
+#define        UNLESSM                 74      /* 0x4a Fails if the following matches. */
+#define        SUSPEND                 75      /* 0x4b "Independent" sub-RE. */
+#define        IFTHEN                  76      /* 0x4c Switch, should be preceded by switcher. */
+#define        GROUPP                  77      /* 0x4d Whether the group matched. */
+#define        EVAL                    78      /* 0x4e Execute some Perl code. */
+#define        MINMOD                  79      /* 0x4f Next operator is not greedy. */
+#define        LOGICAL                 80      /* 0x50 Next opcode should set the flag only. */
+#define        RENUM                   81      /* 0x51 Group with independently numbered parens. */
+#define        TRIE                    82      /* 0x52 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define        TRIEC                   83      /* 0x53 Same as TRIE, but with embedded charclass data */
+#define        AHOCORASICK             84      /* 0x54 Aho Corasick stclass. flags==type */
+#define        AHOCORASICKC            85      /* 0x55 Same as AHOCORASICK, but with embedded charclass data */
+#define        GOSUB                   86      /* 0x56 recurse to paren arg1 at (signed) ofs arg2 */
+#define        NGROUPP                 87      /* 0x57 Whether the group matched. */
+#define        INSUBP                  88      /* 0x58 Whether we are in a specific recurse. */
+#define        DEFINEP                 89      /* 0x59 Never execute directly. */
+#define        ENDLIKE                 90      /* 0x5a Used only for the type field of verbs */
+#define        OPFAIL                  91      /* 0x5b Same as (?!), but with verb arg */
+#define        ACCEPT                  92      /* 0x5c Accepts the current matched string, with verbar */
+#define        VERB                    93      /* 0x5d Used only for the type field of verbs */
+#define        PRUNE                   94      /* 0x5e Pattern fails at this startpoint if no-backtracking through this */
+#define        MARKPOINT               95      /* 0x5f Push the current location for rollback by cut. */
+#define        SKIP                    96      /* 0x60 On failure skip forward (to the mark) before retrying */
+#define        COMMIT                  97      /* 0x61 Pattern fails outright if backtracking through this */
+#define        CUTGROUP                98      /* 0x62 On failure go to the next alternation in the group */
+#define        KEEPS                   99      /* 0x63 $& begins here. */
+#define        LNBREAK                 100     /* 0x64 generic newline pattern */
+#define        OPTIMIZED               101     /* 0x65 Placeholder for dump. */
+#define        PSEUDO                  102     /* 0x66 Pseudo opcode for internal use. */
        /* ------------ States ------------- */
 #define        TRIE_next               (REGNODE_MAX + 1)       /* state for TRIE */
 #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for TRIE */
@@ -211,9 +209,7 @@ EXTCONST U8 PL_regkind[] = {
        EXACT,          /* EXACTFAA_NO_TRIE       */
        EXACT,          /* EXACT_ONLY8            */
        EXACT,          /* EXACTFU_ONLY8          */
-       EXACT,          /* EXACTFS_B_U            */
-       EXACT,          /* EXACTFS_E_U            */
-       EXACT,          /* EXACTFS_BE_U           */
+       EXACT,          /* EXACTFU_S_EDGE         */
        NOTHING,        /* NOTHING                */
        NOTHING,        /* TAIL                   */
        STAR,           /* STAR                   */
@@ -365,9 +361,7 @@ static const U8 regarglen[] = {
        0,                                      /* EXACTFAA_NO_TRIE */
        0,                                      /* EXACT_ONLY8  */
        0,                                      /* EXACTFU_ONLY8 */
-       0,                                      /* EXACTFS_B_U  */
-       0,                                      /* EXACTFS_E_U  */
-       0,                                      /* EXACTFS_BE_U */
+       0,                                      /* EXACTFU_S_EDGE */
        0,                                      /* NOTHING      */
        0,                                      /* TAIL         */
        0,                                      /* STAR         */
@@ -475,9 +469,7 @@ static const char reg_off_by_arg[] = {
        0,      /* EXACTFAA_NO_TRIE */
        0,      /* EXACT_ONLY8  */
        0,      /* EXACTFU_ONLY8 */
-       0,      /* EXACTFS_B_U  */
-       0,      /* EXACTFS_E_U  */
-       0,      /* EXACTFS_BE_U */
+       0,      /* EXACTFU_S_EDGE */
        0,      /* NOTHING      */
        0,      /* TAIL         */
        0,      /* STAR         */
@@ -591,64 +583,62 @@ EXTCONST char * const PL_reg_name[] = {
        "EXACTFAA_NO_TRIE",             /* 0x2c */
        "EXACT_ONLY8",                  /* 0x2d */
        "EXACTFU_ONLY8",                /* 0x2e */
-       "EXACTFS_B_U",                  /* 0x2f */
-       "EXACTFS_E_U",                  /* 0x30 */
-       "EXACTFS_BE_U",                 /* 0x31 */
-       "NOTHING",                      /* 0x32 */
-       "TAIL",                         /* 0x33 */
-       "STAR",                         /* 0x34 */
-       "PLUS",                         /* 0x35 */
-       "CURLY",                        /* 0x36 */
-       "CURLYN",                       /* 0x37 */
-       "CURLYM",                       /* 0x38 */
-       "CURLYX",                       /* 0x39 */
-       "WHILEM",                       /* 0x3a */
-       "OPEN",                         /* 0x3b */
-       "CLOSE",                        /* 0x3c */
-       "SROPEN",                       /* 0x3d */
-       "SRCLOSE",                      /* 0x3e */
-       "REF",                          /* 0x3f */
-       "REFF",                         /* 0x40 */
-       "REFFL",                        /* 0x41 */
-       "REFFU",                        /* 0x42 */
-       "REFFA",                        /* 0x43 */
-       "NREF",                         /* 0x44 */
-       "NREFF",                        /* 0x45 */
-       "NREFFL",                       /* 0x46 */
-       "NREFFU",                       /* 0x47 */
-       "NREFFA",                       /* 0x48 */
-       "LONGJMP",                      /* 0x49 */
-       "BRANCHJ",                      /* 0x4a */
-       "IFMATCH",                      /* 0x4b */
-       "UNLESSM",                      /* 0x4c */
-       "SUSPEND",                      /* 0x4d */
-       "IFTHEN",                       /* 0x4e */
-       "GROUPP",                       /* 0x4f */
-       "EVAL",                         /* 0x50 */
-       "MINMOD",                       /* 0x51 */
-       "LOGICAL",                      /* 0x52 */
-       "RENUM",                        /* 0x53 */
-       "TRIE",                         /* 0x54 */
-       "TRIEC",                        /* 0x55 */
-       "AHOCORASICK",                  /* 0x56 */
-       "AHOCORASICKC",                 /* 0x57 */
-       "GOSUB",                        /* 0x58 */
-       "NGROUPP",                      /* 0x59 */
-       "INSUBP",                       /* 0x5a */
-       "DEFINEP",                      /* 0x5b */
-       "ENDLIKE",                      /* 0x5c */
-       "OPFAIL",                       /* 0x5d */
-       "ACCEPT",                       /* 0x5e */
-       "VERB",                         /* 0x5f */
-       "PRUNE",                        /* 0x60 */
-       "MARKPOINT",                    /* 0x61 */
-       "SKIP",                         /* 0x62 */
-       "COMMIT",                       /* 0x63 */
-       "CUTGROUP",                     /* 0x64 */
-       "KEEPS",                        /* 0x65 */
-       "LNBREAK",                      /* 0x66 */
-       "OPTIMIZED",                    /* 0x67 */
-       "PSEUDO",                       /* 0x68 */
+       "EXACTFU_S_EDGE",               /* 0x2f */
+       "NOTHING",                      /* 0x30 */
+       "TAIL",                         /* 0x31 */
+       "STAR",                         /* 0x32 */
+       "PLUS",                         /* 0x33 */
+       "CURLY",                        /* 0x34 */
+       "CURLYN",                       /* 0x35 */
+       "CURLYM",                       /* 0x36 */
+       "CURLYX",                       /* 0x37 */
+       "WHILEM",                       /* 0x38 */
+       "OPEN",                         /* 0x39 */
+       "CLOSE",                        /* 0x3a */
+       "SROPEN",                       /* 0x3b */
+       "SRCLOSE",                      /* 0x3c */
+       "REF",                          /* 0x3d */
+       "REFF",                         /* 0x3e */
+       "REFFL",                        /* 0x3f */
+       "REFFU",                        /* 0x40 */
+       "REFFA",                        /* 0x41 */
+       "NREF",                         /* 0x42 */
+       "NREFF",                        /* 0x43 */
+       "NREFFL",                       /* 0x44 */
+       "NREFFU",                       /* 0x45 */
+       "NREFFA",                       /* 0x46 */
+       "LONGJMP",                      /* 0x47 */
+       "BRANCHJ",                      /* 0x48 */
+       "IFMATCH",                      /* 0x49 */
+       "UNLESSM",                      /* 0x4a */
+       "SUSPEND",                      /* 0x4b */
+       "IFTHEN",                       /* 0x4c */
+       "GROUPP",                       /* 0x4d */
+       "EVAL",                         /* 0x4e */
+       "MINMOD",                       /* 0x4f */
+       "LOGICAL",                      /* 0x50 */
+       "RENUM",                        /* 0x51 */
+       "TRIE",                         /* 0x52 */
+       "TRIEC",                        /* 0x53 */
+       "AHOCORASICK",                  /* 0x54 */
+       "AHOCORASICKC",                 /* 0x55 */
+       "GOSUB",                        /* 0x56 */
+       "NGROUPP",                      /* 0x57 */
+       "INSUBP",                       /* 0x58 */
+       "DEFINEP",                      /* 0x59 */
+       "ENDLIKE",                      /* 0x5a */
+       "OPFAIL",                       /* 0x5b */
+       "ACCEPT",                       /* 0x5c */
+       "VERB",                         /* 0x5d */
+       "PRUNE",                        /* 0x5e */
+       "MARKPOINT",                    /* 0x5f */
+       "SKIP",                         /* 0x60 */
+       "COMMIT",                       /* 0x61 */
+       "CUTGROUP",                     /* 0x62 */
+       "KEEPS",                        /* 0x63 */
+       "LNBREAK",                      /* 0x64 */
+       "OPTIMIZED",                    /* 0x65 */
+       "PSEUDO",                       /* 0x66 */
        /* ------------ States ------------- */
        "TRIE_next",                    /* REGNODE_MAX +0x01 */
        "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
@@ -783,7 +773,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0xF0, 0x87, 0xFF, 0x65, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0xFC, 0xE1, 0x7F, 0x19, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -806,7 +796,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0x00, 0xFF, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0xFF, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
index ee15556..671f6c7 100644 (file)
@@ -355,7 +355,7 @@ pod/perl.pod        Verbatim line length including indents exceeds 79 by    8
 pod/perlandroid.pod    Verbatim line length including indents exceeds 79 by    3
 pod/perlbook.pod       Verbatim line length including indents exceeds 79 by    1
 pod/perlce.pod Verbatim line length including indents exceeds 79 by    3
-pod/perldebguts.pod    Verbatim line length including indents exceeds 79 by    28
+pod/perldebguts.pod    Verbatim line length including indents exceeds 79 by    27
 pod/perldebtut.pod     Verbatim line length including indents exceeds 79 by    3
 pod/perldtrace.pod     Verbatim line length including indents exceeds 79 by    7
 pod/perlgit.pod        ? Should you be using F<...> or maybe L<...> instead of 1