From 3146c00a633e9cbed741e10146662fbcedfdb8d3 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Wed, 5 Jun 2019 11:15:00 -0600
Subject: [PATCH] Add ANYOFHr regnode

This commit adds a new regnode, ANYOFHr, like ANYOFH, but it also has a
loose upper bound for the first UTF-8 byte matchable by the node.  (The
'r' stands for 'range').  It would be nice to have a tight upper bound,
but to do so requires 4 more bits than are available without changing
the node argument types, and hence increasing the node size.  Having a
loose bound is better than no bound, and comes essentially free, by
using two unused bits in the current ANYOFH node, and requiring only a
few extra, pipeline-able instructions (mask, etc.) at run time, and no
extra conditionals.  Any ANYOFH nodes that would benefit from having an
upper bound will instead be compiled into this node type.

Its use is based on the following observations.

There are 64 possible start bytes, so the full range can be expressed in
6 bits.  This means that the flags field in ANYOFH nodes containing the
start byte has two extra bits that can be used for something else.

An ANYOFH node is generated only when none of the code points it matches
is in the bit map, so the smallest code point it could match is 256.  The
start byte for that is C4, so there are actually only 60 possible start
bytes.  (Perl can be compiled with a larger bit map, in which case the
minimum start byte would be even higher.)

A second observation is that knowing the highest start byte is above F0
is no better than knowing it's F0.  This is because the highest code
point whose start byte is F0 is U+3FFFF, and all code points above that
that are currently allocated are all very specialized and rarely
encountered.  And there's no likelihood of that changing anytime soon as
there's plenty of unallocated space below that.  So if an ANYOFH node's
highest start byte is F0 or above, there's no advantage to knowing what
the actual max possible start byte is, so leave it as ANYOFH.

That means the highest start byte we care about in ANYOFHr is EF.  That
cuts the number of start bytes we care about down to 43, still 6 bits
required to represent them, but it allows for the following scheme:

Populate the flags field by subtracting C0 from the lowest start byte
and shifting the result left 2 bits.  That leaves the bottom two bits
unused.  We use them as follows, where x is the start byte of the lowest
code point in the node:

bits
----
11  The upper limit of the range can be as much as x + (EF - x) / 8
10  The upper limit of the range can be as much as x + (EF - x) / 4
01  The upper limit of the range can be as much as x + (EF - x) / 2
00  The upper limit of the range can be as much as EF

That partitions the loose upper bound into 4 possible ranges, with it
being tighter the closer it is to the strict lower bound.  This makes
the loose upper bound more meaningful when there is most to gain by
having one.

Some examples of what the various upper bounds would be for all the
possibilities of these two bits are:
           Upper bound given the 2 bits
Low bound  11 10 01 00
---------  -- -- -- --
C4         C9 CE D9 EF
D0         D3 D7 DF EF
E0         E1 E3 E7 EF

Start bytes of E0 and above represent many more code points each than
lower ones, as they are 3 byte sequences instead of two.  This scheme
provides tighter bounds for them, which is also a point in its favor.

Thus we have provided a loose upper bound using two otherwise unused
bits.  An alternate scheme could have had the intervals all the same,
but this provides a tighter bound when it makes the most sense to.

For EBCDIC the range is from C8 to F4.

Tests will be added in a later commit.
---
 pod/perldebguts.pod |   3 +
 regcomp.c           |  71 ++++++++---
 regcomp.h           |  24 ++++
 regcomp.sym         |   1 +
 regexec.c           |  42 ++++++-
 regnodes.h          | 333 ++++++++++++++++++++++++++--------------------------
 6 files changed, 291 insertions(+), 183 deletions(-)

diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index 15b716b..b924689 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -618,6 +618,9 @@ will be lost.
                              byte
  ANYOFHb          sv 1       Like ANYOFH, but all matches share the same
                              UTF-8 start byte, given in the flags field
+ ANYOFHr          sv 1       Like ANYOFH, but the flags field contains
+                             packed bounds for all matchable UTF-8 start
+                             bytes.
 
  ANYOFM           byte 1     Like ANYOF, but matches an invariant byte
                              as determined by the mask and arg
diff --git a/regcomp.c b/regcomp.c
index 0046039..fcde30d 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1582,7 +1582,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     unsigned int i;
     const U32 n = ARG(node);
     bool new_node_has_latin1 = FALSE;
-    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHb))
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
                       ? 0
                       : ANYOF_FLAGS(node);
 
@@ -1637,7 +1637,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     }
 
     /* Add in the points from the bit map */
-    if (! inRANGE(OP(node), ANYOFH, ANYOFHb)) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
             if (ANYOF_BITMAP_TEST(node, i)) {
                 unsigned int start = i++;
@@ -1724,7 +1724,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * another SSC or a regular ANYOF class.  Can create false positives. */
 
     SV* anded_cp_list;
-    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHb)
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
                           ? 0
                           : ANYOF_FLAGS(and_with);
     U8  anded_flags;
@@ -1910,7 +1910,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 
     SV* ored_cp_list;
     U8 ored_flags;
-    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHb)
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
                          ? 0
                          : ANYOF_FLAGS(or_with);
 
@@ -5851,6 +5851,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                 case ANYOFPOSIXL:
                 case ANYOFH:
                 case ANYOFHb:
+                case ANYOFHr:
                 case ANYOF:
 		    if (flags & SCF_DO_STCLASS_AND)
 			ssc_and(pRExC_state, data->start_class,
@@ -14794,7 +14795,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
     assert(PL_regkind[OP(node)] == ANYOF);
 
     /* There is no bitmap for this node type */
-    if (inRANGE(OP(node), ANYOFH, ANYOFHb)) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
         return;
     }
 
@@ -19048,15 +19049,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              * to code point.  For EBCDIC, this has to be I8. */
             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
 
-            /* If the lowest and highest code point in the class have the same
-             * UTF-8 first byte, then all have that byte, and we can get an
-             * exact first byte instead of a minimum.  We signal this with a
-             * different regnode */
+            /* If the first UTF-8 start byte for the highest code point in the
+             * range is suitably small, we may be able to get an upper bound as
+             * well */
             if (highest_cp <= IV_MAX) {
                 U8 high_utf8[UTF8_MAXBYTES+1];
 
                 (void) uvchr_to_utf8(high_utf8, highest_cp);
 
+                /* If the lowest and highest are the same, we can get an exact
+                 * first byte instead of just a minimum.  We signal this with a
+                 * different regnode */
                 if (low_utf8[0] == high_utf8[0]) {
 
                     /* No need to convert to I8 for EBCDIC as this is an exact
@@ -19064,6 +19067,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     anyof_flags = low_utf8[0];
                     op = ANYOFHb;
                 }
+                else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+                {
+
+                    /* Here, the high byte is not the same as the low, but is
+                     * small enough that it's reasonable to have a loose upper
+                     * bound, which is packed in with the strict lower bound.
+                     * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+                     * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
+                     * is the same thing as UTF-8 */
+
+                    U8 bits = 0;
+                    U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+                    U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+                                  - anyof_flags;
+
+                    if (range_diff <= max_range_diff / 8) {
+                        bits = 3;
+                    }
+                    else if (range_diff <= max_range_diff / 4) {
+                        bits = 2;
+                    }
+                    else if (range_diff <= max_range_diff / 2) {
+                        bits = 1;
+                    }
+                    anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+                    op = ANYOFHr;
+                }
             }
 
             goto done_finding_op;
@@ -20366,7 +20396,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* 2: embedded, otherwise 1 */
 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
     else if (k == ANYOF) {
-	const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHb)
+	const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
                           ? 0
                           : ANYOF_FLAGS(o);
         bool do_sep = FALSE;    /* Do we need to separate various components of
@@ -20424,7 +20454,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* Ready to start outputting.  First, the initial left bracket */
 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
 
-        if (! inRANGE(OP(o), ANYOFH, ANYOFHb)) {
+        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
             /* Then all the things that could fit in the bitmap */
             do_sep = put_charclass_bitmap_innards(sv,
                                                   ANYOF_BITMAP(o),
@@ -20522,17 +20552,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And finally the matching, closing ']' */
 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
 
-        if (inRANGE(OP(o), ANYOFH, ANYOFHb)) {
-            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x", FLAGS(o));
-            if (OP(o) == ANYOFH) {
-                /* Not strictly true for 32-bit or EBCDIC, but good
-                 * enough */
-                Perl_sv_catpvf(aTHX_ sv, "..\\xff");
+        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+            U8 lowest = (OP(o) != ANYOFHr)
+                         ? FLAGS(o)
+                         : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+            U8 highest = (OP(o) == ANYOFHb)
+                         ? lowest
+                         : OP(o) == ANYOFH
+                           ? 0xFF
+                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+            if (lowest != highest) {
+                Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
             }
             Perl_sv_catpvf(aTHX_ sv, ")");
         }
 
-
         SvREFCNT_dec(unresolved);
     }
     else if (k == ANYOFM) {
diff --git a/regcomp.h b/regcomp.h
index 5002e2b..ac61408 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -1126,6 +1126,30 @@ typedef enum {
 	WB_BOUND
 } bound_type;
 
+/* This unpacks the FLAGS field of ANYOFHx nodes.  The value it contains
+ * gives the strict lower bound for the UTF-8 start byte of any code point
+ * matchable by the node, and a loose upper bound as well.
+ *
+ * The low bound is stored, less 0xC0, in the upper 6 bits of the field.
+ * The loose upper bound is determined from the lowest 2 bits and the low bound
+ * (called x) as follows:
+ *
+ * 11  The upper limit of the range can be as much as x + (EF - x) / 8
+ * 10  The upper limit of the range can be as much as x + (EF - x) / 4
+ * 01  The upper limit of the range can be as much as x + (EF - x) / 2
+ * 00  The upper limit of the range can be as much as  EF
+ *
+ * For motivation of this design, see the commit message */
+#ifdef EBCDIC
+#  define MAX_ANYOF_HRx_BYTE  0xF4
+#else
+#  define MAX_ANYOF_HRx_BYTE  0xEF
+#endif
+#define LOWEST_ANYOF_HRx_BYTE(b) (((b) >> 2) + 0xC0)
+#define HIGHEST_ANYOF_HRx_BYTE(b)                                           \
+                                  (LOWEST_ANYOF_HRx_BYTE(b)                 \
+          + ((MAX_ANYOF_HRx_BYTE - LOWEST_ANYOF_HRx_BYTE(b)) >> ((b) & 3)))
+
 #endif /* PERL_REGCOMP_H_ */
 
 /*
diff --git a/regcomp.sym b/regcomp.sym
index 9e2c6d3..522293c 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -66,6 +66,7 @@ ANYOFPOSIXL ANYOF,      sv charclass_posixl S    ; Like ANYOFL, but matches [[:p
 # Must be sequential
 ANYOFH      ANYOF,      sv 1 S    ; Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte
 ANYOFHb     ANYOF,      sv 1 S    ; Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field
+ANYOFHr     ANYOF,      sv 1 S    ; Like ANYOFH, but the flags field contains packed bounds for all matchable UTF-8 start bytes.
 
 ANYOFM      ANYOFM      byte 1 S  ; Like ANYOF, but matches an invariant byte as determined by the mask and arg
 NANYOFM     ANYOFM      byte 1 S  ; complement of ANYOFM
diff --git a/regexec.c b/regexec.c
index a205696..e4ec07e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2195,6 +2195,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         }
         break;
 
+    case ANYOFHr:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   inRANGE((U8) NATIVE_UTF8_TO_I8(*s),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+        }
+        break;
+
     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
         assert(! is_utf8_pat);
 	/* FALLTHROUGH */
@@ -6827,6 +6837,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             goto increment_locinput;
             break;
 
+        case ANYOFHr:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
+                             LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
+                             HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
+	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                   utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
         /* The argument (FLAGS) to all the POSIX node types is the class number
          * */
 
@@ -9619,6 +9643,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         }
         break;
 
+    case ANYOFHr:
+        if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
+            while (  hardcount < max
+                   && scan < this_eol
+                   && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        break;
+
     /* The argument (FLAGS) to all the POSIX node types is the class number */
 
     case NPOSIXL:
@@ -9862,7 +9902,7 @@ STATIC bool
 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
 {
     dVAR;
-    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHb))
+    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr))
                         ? 0
                         : ANYOF_FLAGS(n);
     bool match = FALSE;
diff --git a/regnodes.h b/regnodes.h
index 5e39b50..211980d 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX           	102
-#define REGMATCH_STATE_MAX    	142
+#define REGNODE_MAX           	103
+#define REGMATCH_STATE_MAX    	143
 
 #define	END                   	0	/* 0000 End of program. */
 #define	SUCCEED               	1	/* 0x01 Return from a subroutine, basically. */
@@ -35,85 +35,86 @@
 #define	ANYOFPOSIXL           	21	/* 0x15 Like ANYOFL, but matches [[:posix:]] classes */
 #define	ANYOFH                	22	/* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte */
 #define	ANYOFHb               	23	/* 0x17 Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field */
-#define	ANYOFM                	24	/* 0x18 Like ANYOF, but matches an invariant byte as determined by the mask and arg */
-#define	NANYOFM               	25	/* 0x19 complement of ANYOFM */
-#define	POSIXD                	26	/* 0x1a Some [[:class:]] under /d; the FLAGS field gives which one */
-#define	POSIXL                	27	/* 0x1b Some [[:class:]] under /l; the FLAGS field gives which one */
-#define	POSIXU                	28	/* 0x1c Some [[:class:]] under /u; the FLAGS field gives which one */
-#define	POSIXA                	29	/* 0x1d Some [[:class:]] under /a; the FLAGS field gives which one */
-#define	NPOSIXD               	30	/* 0x1e complement of POSIXD, [[:^class:]] */
-#define	NPOSIXL               	31	/* 0x1f complement of POSIXL, [[:^class:]] */
-#define	NPOSIXU               	32	/* 0x20 complement of POSIXU, [[:^class:]] */
-#define	NPOSIXA               	33	/* 0x21 complement of POSIXA, [[:^class:]] */
-#define	CLUMP                 	34	/* 0x22 Match any extended grapheme cluster sequence */
-#define	BRANCH                	35	/* 0x23 Match this alternative, or the next... */
-#define	EXACT                 	36	/* 0x24 Match this string (preceded by length). */
-#define	EXACTL                	37	/* 0x25 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for). */
-#define	EXACTF                	38	/* 0x26 Match this string using /id rules (w/len); (string not UTF-8, not guaranteed to be folded). */
-#define	EXACTFL               	39	/* 0x27 Match this string using /il rules (w/len); (string not guaranteed to be folded). */
-#define	EXACTFU               	40	/* 0x28 Match this string using /iu rules (w/len); (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
-#define	EXACTFAA              	41	/* 0x29 Match this string using /iaa rules (w/len) (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
-#define	EXACTFUP              	42	/* 0x2a Match this string using /iu rules (w/len); (string not UTF-8, not guaranteed to be folded; and its Problematic). */
-#define	EXACTFLU8             	43	/* 0x2b Like EXACTFU, but use /il, UTF-8, folded, and everything in it is above 255. */
-#define	EXACTFAA_NO_TRIE      	44	/* 0x2c Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able). */
-#define	EXACT_ONLY8           	45	/* 0x2d Like EXACT, but only UTF-8 encoded targets can match */
-#define	EXACTFU_ONLY8         	46	/* 0x2e Like EXACTFU, but only UTF-8 encoded targets can match */
-#define	EXACTFU_S_EDGE        	47	/* 0x2f /di rules, but nothing in it precludes /ui, except begins and/or ends with [Ss]; (string not UTF-8; compile-time only). */
-#define	NOTHING               	48	/* 0x30 Match empty string. */
-#define	TAIL                  	49	/* 0x31 Match empty string. Can jump here from outside. */
-#define	STAR                  	50	/* 0x32 Match this (simple) thing 0 or more times. */
-#define	PLUS                  	51	/* 0x33 Match this (simple) thing 1 or more times. */
-#define	CURLY                 	52	/* 0x34 Match this simple thing {n,m} times. */
-#define	CURLYN                	53	/* 0x35 Capture next-after-this simple thing */
-#define	CURLYM                	54	/* 0x36 Capture this medium-complex thing {n,m} times. */
-#define	CURLYX                	55	/* 0x37 Match this complex thing {n,m} times. */
-#define	WHILEM                	56	/* 0x38 Do curly processing and see if rest matches. */
-#define	OPEN                  	57	/* 0x39 Mark this point in input as start of #n. */
-#define	CLOSE                 	58	/* 0x3a Close corresponding OPEN of #n. */
-#define	SROPEN                	59	/* 0x3b Same as OPEN, but for script run */
-#define	SRCLOSE               	60	/* 0x3c Close preceding SROPEN */
-#define	REF                   	61	/* 0x3d Match some already matched string */
-#define	REFF                  	62	/* 0x3e Match already matched string, using /di rules. */
-#define	REFFL                 	63	/* 0x3f Match already matched string, using /li rules. */
-#define	REFFU                 	64	/* 0x40 Match already matched string, usng /ui. */
-#define	REFFA                 	65	/* 0x41 Match already matched string, using /aai rules. */
-#define	REFN                  	66	/* 0x42 Match some already matched string */
-#define	REFFN                 	67	/* 0x43 Match already matched string, using /di rules. */
-#define	REFFLN                	68	/* 0x44 Match already matched string, using /li rules. */
-#define	REFFUN                	69	/* 0x45 Match already matched string, using /ui rules. */
-#define	REFFAN                	70	/* 0x46 Match already matched string, using /aai rules. */
-#define	LONGJMP               	71	/* 0x47 Jump far away. */
-#define	BRANCHJ               	72	/* 0x48 BRANCH with long offset. */
-#define	IFMATCH               	73	/* 0x49 Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
-#define	UNLESSM               	74	/* 0x4a Fails if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
-#define	SUSPEND               	75	/* 0x4b "Independent" sub-RE. */
-#define	IFTHEN                	76	/* 0x4c Switch, should be preceded by switcher. */
-#define	GROUPP                	77	/* 0x4d Whether the group matched. */
-#define	EVAL                  	78	/* 0x4e Execute some Perl code. */
-#define	MINMOD                	79	/* 0x4f Next operator is not greedy. */
-#define	LOGICAL               	80	/* 0x50 Next opcode should set the flag only. */
-#define	RENUM                 	81	/* 0x51 Group with independently numbered parens. */
-#define	TRIE                  	82	/* 0x52 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define	TRIEC                 	83	/* 0x53 Same as TRIE, but with embedded charclass data */
-#define	AHOCORASICK           	84	/* 0x54 Aho Corasick stclass. flags==type */
-#define	AHOCORASICKC          	85	/* 0x55 Same as AHOCORASICK, but with embedded charclass data */
-#define	GOSUB                 	86	/* 0x56 recurse to paren arg1 at (signed) ofs arg2 */
-#define	GROUPPN               	87	/* 0x57 Whether the group matched. */
-#define	INSUBP                	88	/* 0x58 Whether we are in a specific recurse. */
-#define	DEFINEP               	89	/* 0x59 Never execute directly. */
-#define	ENDLIKE               	90	/* 0x5a Used only for the type field of verbs */
-#define	OPFAIL                	91	/* 0x5b Same as (?!), but with verb arg */
-#define	ACCEPT                	92	/* 0x5c Accepts the current matched string, with verbar */
-#define	VERB                  	93	/* 0x5d Used only for the type field of verbs */
-#define	PRUNE                 	94	/* 0x5e Pattern fails at this startpoint if no-backtracking through this */
-#define	MARKPOINT             	95	/* 0x5f Push the current location for rollback by cut. */
-#define	SKIP                  	96	/* 0x60 On failure skip forward (to the mark) before retrying */
-#define	COMMIT                	97	/* 0x61 Pattern fails outright if backtracking through this */
-#define	CUTGROUP              	98	/* 0x62 On failure go to the next alternation in the group */
-#define	KEEPS                 	99	/* 0x63 $& begins here. */
-#define	LNBREAK               	100	/* 0x64 generic newline pattern */
-#define	OPTIMIZED             	101	/* 0x65 Placeholder for dump. */
-#define	PSEUDO                	102	/* 0x66 Pseudo opcode for internal use. */
+#define	ANYOFHr               	24	/* 0x18 Like ANYOFH, but the flags field contains packed bounds for all matchable UTF-8 start bytes. */
+#define	ANYOFM                	25	/* 0x19 Like ANYOF, but matches an invariant byte as determined by the mask and arg */
+#define	NANYOFM               	26	/* 0x1a complement of ANYOFM */
+#define	POSIXD                	27	/* 0x1b Some [[:class:]] under /d; the FLAGS field gives which one */
+#define	POSIXL                	28	/* 0x1c Some [[:class:]] under /l; the FLAGS field gives which one */
+#define	POSIXU                	29	/* 0x1d Some [[:class:]] under /u; the FLAGS field gives which one */
+#define	POSIXA                	30	/* 0x1e Some [[:class:]] under /a; the FLAGS field gives which one */
+#define	NPOSIXD               	31	/* 0x1f complement of POSIXD, [[:^class:]] */
+#define	NPOSIXL               	32	/* 0x20 complement of POSIXL, [[:^class:]] */
+#define	NPOSIXU               	33	/* 0x21 complement of POSIXU, [[:^class:]] */
+#define	NPOSIXA               	34	/* 0x22 complement of POSIXA, [[:^class:]] */
+#define	CLUMP                 	35	/* 0x23 Match any extended grapheme cluster sequence */
+#define	BRANCH                	36	/* 0x24 Match this alternative, or the next... */
+#define	EXACT                 	37	/* 0x25 Match this string (preceded by length). */
+#define	EXACTL                	38	/* 0x26 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for). */
+#define	EXACTF                	39	/* 0x27 Match this string using /id rules (w/len); (string not UTF-8, not guaranteed to be folded). */
+#define	EXACTFL               	40	/* 0x28 Match this string using /il rules (w/len); (string not guaranteed to be folded). */
+#define	EXACTFU               	41	/* 0x29 Match this string using /iu rules (w/len); (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
+#define	EXACTFAA              	42	/* 0x2a Match this string using /iaa rules (w/len) (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
+#define	EXACTFUP              	43	/* 0x2b Match this string using /iu rules (w/len); (string not UTF-8, not guaranteed to be folded; and its Problematic). */
+#define	EXACTFLU8             	44	/* 0x2c Like EXACTFU, but use /il, UTF-8, folded, and everything in it is above 255. */
+#define	EXACTFAA_NO_TRIE      	45	/* 0x2d Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able). */
+#define	EXACT_ONLY8           	46	/* 0x2e Like EXACT, but only UTF-8 encoded targets can match */
+#define	EXACTFU_ONLY8         	47	/* 0x2f Like EXACTFU, but only UTF-8 encoded targets can match */
+#define	EXACTFU_S_EDGE        	48	/* 0x30 /di rules, but nothing in it precludes /ui, except begins and/or ends with [Ss]; (string not UTF-8; compile-time only). */
+#define	NOTHING               	49	/* 0x31 Match empty string. */
+#define	TAIL                  	50	/* 0x32 Match empty string. Can jump here from outside. */
+#define	STAR                  	51	/* 0x33 Match this (simple) thing 0 or more times. */
+#define	PLUS                  	52	/* 0x34 Match this (simple) thing 1 or more times. */
+#define	CURLY                 	53	/* 0x35 Match this simple thing {n,m} times. */
+#define	CURLYN                	54	/* 0x36 Capture next-after-this simple thing */
+#define	CURLYM                	55	/* 0x37 Capture this medium-complex thing {n,m} times. */
+#define	CURLYX                	56	/* 0x38 Match this complex thing {n,m} times. */
+#define	WHILEM                	57	/* 0x39 Do curly processing and see if rest matches. */
+#define	OPEN                  	58	/* 0x3a Mark this point in input as start of #n. */
+#define	CLOSE                 	59	/* 0x3b Close corresponding OPEN of #n. */
+#define	SROPEN                	60	/* 0x3c Same as OPEN, but for script run */
+#define	SRCLOSE               	61	/* 0x3d Close preceding SROPEN */
+#define	REF                   	62	/* 0x3e Match some already matched string */
+#define	REFF                  	63	/* 0x3f Match already matched string, using /di rules. */
+#define	REFFL                 	64	/* 0x40 Match already matched string, using /li rules. */
+#define	REFFU                 	65	/* 0x41 Match already matched string, usng /ui. */
+#define	REFFA                 	66	/* 0x42 Match already matched string, using /aai rules. */
+#define	REFN                  	67	/* 0x43 Match some already matched string */
+#define	REFFN                 	68	/* 0x44 Match already matched string, using /di rules. */
+#define	REFFLN                	69	/* 0x45 Match already matched string, using /li rules. */
+#define	REFFUN                	70	/* 0x46 Match already matched string, using /ui rules. */
+#define	REFFAN                	71	/* 0x47 Match already matched string, using /aai rules. */
+#define	LONGJMP               	72	/* 0x48 Jump far away. */
+#define	BRANCHJ               	73	/* 0x49 BRANCH with long offset. */
+#define	IFMATCH               	74	/* 0x4a Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
+#define	UNLESSM               	75	/* 0x4b Fails if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
+#define	SUSPEND               	76	/* 0x4c "Independent" sub-RE. */
+#define	IFTHEN                	77	/* 0x4d Switch, should be preceded by switcher. */
+#define	GROUPP                	78	/* 0x4e Whether the group matched. */
+#define	EVAL                  	79	/* 0x4f Execute some Perl code. */
+#define	MINMOD                	80	/* 0x50 Next operator is not greedy. */
+#define	LOGICAL               	81	/* 0x51 Next opcode should set the flag only. */
+#define	RENUM                 	82	/* 0x52 Group with independently numbered parens. */
+#define	TRIE                  	83	/* 0x53 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define	TRIEC                 	84	/* 0x54 Same as TRIE, but with embedded charclass data */
+#define	AHOCORASICK           	85	/* 0x55 Aho Corasick stclass. flags==type */
+#define	AHOCORASICKC          	86	/* 0x56 Same as AHOCORASICK, but with embedded charclass data */
+#define	GOSUB                 	87	/* 0x57 recurse to paren arg1 at (signed) ofs arg2 */
+#define	GROUPPN               	88	/* 0x58 Whether the group matched. */
+#define	INSUBP                	89	/* 0x59 Whether we are in a specific recurse. */
+#define	DEFINEP               	90	/* 0x5a Never execute directly. */
+#define	ENDLIKE               	91	/* 0x5b Used only for the type field of verbs */
+#define	OPFAIL                	92	/* 0x5c Same as (?!), but with verb arg */
+#define	ACCEPT                	93	/* 0x5d Accepts the current matched string, with verbar */
+#define	VERB                  	94	/* 0x5e Used only for the type field of verbs */
+#define	PRUNE                 	95	/* 0x5f Pattern fails at this startpoint if no-backtracking through this */
+#define	MARKPOINT             	96	/* 0x60 Push the current location for rollback by cut. */
+#define	SKIP                  	97	/* 0x61 On failure skip forward (to the mark) before retrying */
+#define	COMMIT                	98	/* 0x62 Pattern fails outright if backtracking through this */
+#define	CUTGROUP              	99	/* 0x63 On failure go to the next alternation in the group */
+#define	KEEPS                 	100	/* 0x64 $& begins here. */
+#define	LNBREAK               	101	/* 0x65 generic newline pattern */
+#define	OPTIMIZED             	102	/* 0x66 Placeholder for dump. */
+#define	PSEUDO                	103	/* 0x67 Pseudo opcode for internal use. */
 	/* ------------ States ------------- */
 #define	TRIE_next             	(REGNODE_MAX + 1)	/* state for TRIE */
 #define	TRIE_next_fail        	(REGNODE_MAX + 2)	/* state for TRIE */
@@ -186,6 +187,7 @@ EXTCONST U8 PL_regkind[] = {
 	ANYOF,    	/* ANYOFPOSIXL            */
 	ANYOF,    	/* ANYOFH                 */
 	ANYOF,    	/* ANYOFHb                */
+	ANYOF,    	/* ANYOFHr                */
 	ANYOFM,   	/* ANYOFM                 */
 	ANYOFM,   	/* NANYOFM                */
 	POSIXD,   	/* POSIXD                 */
@@ -338,6 +340,7 @@ static const U8 regarglen[] = {
 	EXTRA_SIZE(struct regnode_charclass_posixl),	/* ANYOFPOSIXL  */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFH       */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFHb      */
+	EXTRA_SIZE(struct regnode_1),        	/* ANYOFHr      */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFM       */
 	EXTRA_SIZE(struct regnode_1),        	/* NANYOFM      */
 	0,                                   	/* POSIXD       */
@@ -446,6 +449,7 @@ static const char reg_off_by_arg[] = {
 	0,	/* ANYOFPOSIXL  */
 	0,	/* ANYOFH       */
 	0,	/* ANYOFHb      */
+	0,	/* ANYOFHr      */
 	0,	/* ANYOFM       */
 	0,	/* NANYOFM      */
 	0,	/* POSIXD       */
@@ -560,85 +564,86 @@ EXTCONST char * const PL_reg_name[] = {
 	"ANYOFPOSIXL",           	/* 0x15 */
 	"ANYOFH",                	/* 0x16 */
 	"ANYOFHb",               	/* 0x17 */
-	"ANYOFM",                	/* 0x18 */
-	"NANYOFM",               	/* 0x19 */
-	"POSIXD",                	/* 0x1a */
-	"POSIXL",                	/* 0x1b */
-	"POSIXU",                	/* 0x1c */
-	"POSIXA",                	/* 0x1d */
-	"NPOSIXD",               	/* 0x1e */
-	"NPOSIXL",               	/* 0x1f */
-	"NPOSIXU",               	/* 0x20 */
-	"NPOSIXA",               	/* 0x21 */
-	"CLUMP",                 	/* 0x22 */
-	"BRANCH",                	/* 0x23 */
-	"EXACT",                 	/* 0x24 */
-	"EXACTL",                	/* 0x25 */
-	"EXACTF",                	/* 0x26 */
-	"EXACTFL",               	/* 0x27 */
-	"EXACTFU",               	/* 0x28 */
-	"EXACTFAA",              	/* 0x29 */
-	"EXACTFUP",              	/* 0x2a */
-	"EXACTFLU8",             	/* 0x2b */
-	"EXACTFAA_NO_TRIE",      	/* 0x2c */
-	"EXACT_ONLY8",           	/* 0x2d */
-	"EXACTFU_ONLY8",         	/* 0x2e */
-	"EXACTFU_S_EDGE",        	/* 0x2f */
-	"NOTHING",               	/* 0x30 */
-	"TAIL",                  	/* 0x31 */
-	"STAR",                  	/* 0x32 */
-	"PLUS",                  	/* 0x33 */
-	"CURLY",                 	/* 0x34 */
-	"CURLYN",                	/* 0x35 */
-	"CURLYM",                	/* 0x36 */
-	"CURLYX",                	/* 0x37 */
-	"WHILEM",                	/* 0x38 */
-	"OPEN",                  	/* 0x39 */
-	"CLOSE",                 	/* 0x3a */
-	"SROPEN",                	/* 0x3b */
-	"SRCLOSE",               	/* 0x3c */
-	"REF",                   	/* 0x3d */
-	"REFF",                  	/* 0x3e */
-	"REFFL",                 	/* 0x3f */
-	"REFFU",                 	/* 0x40 */
-	"REFFA",                 	/* 0x41 */
-	"REFN",                  	/* 0x42 */
-	"REFFN",                 	/* 0x43 */
-	"REFFLN",                	/* 0x44 */
-	"REFFUN",                	/* 0x45 */
-	"REFFAN",                	/* 0x46 */
-	"LONGJMP",               	/* 0x47 */
-	"BRANCHJ",               	/* 0x48 */
-	"IFMATCH",               	/* 0x49 */
-	"UNLESSM",               	/* 0x4a */
-	"SUSPEND",               	/* 0x4b */
-	"IFTHEN",                	/* 0x4c */
-	"GROUPP",                	/* 0x4d */
-	"EVAL",                  	/* 0x4e */
-	"MINMOD",                	/* 0x4f */
-	"LOGICAL",               	/* 0x50 */
-	"RENUM",                 	/* 0x51 */
-	"TRIE",                  	/* 0x52 */
-	"TRIEC",                 	/* 0x53 */
-	"AHOCORASICK",           	/* 0x54 */
-	"AHOCORASICKC",          	/* 0x55 */
-	"GOSUB",                 	/* 0x56 */
-	"GROUPPN",               	/* 0x57 */
-	"INSUBP",                	/* 0x58 */
-	"DEFINEP",               	/* 0x59 */
-	"ENDLIKE",               	/* 0x5a */
-	"OPFAIL",                	/* 0x5b */
-	"ACCEPT",                	/* 0x5c */
-	"VERB",                  	/* 0x5d */
-	"PRUNE",                 	/* 0x5e */
-	"MARKPOINT",             	/* 0x5f */
-	"SKIP",                  	/* 0x60 */
-	"COMMIT",                	/* 0x61 */
-	"CUTGROUP",              	/* 0x62 */
-	"KEEPS",                 	/* 0x63 */
-	"LNBREAK",               	/* 0x64 */
-	"OPTIMIZED",             	/* 0x65 */
-	"PSEUDO",                	/* 0x66 */
+	"ANYOFHr",               	/* 0x18 */
+	"ANYOFM",                	/* 0x19 */
+	"NANYOFM",               	/* 0x1a */
+	"POSIXD",                	/* 0x1b */
+	"POSIXL",                	/* 0x1c */
+	"POSIXU",                	/* 0x1d */
+	"POSIXA",                	/* 0x1e */
+	"NPOSIXD",               	/* 0x1f */
+	"NPOSIXL",               	/* 0x20 */
+	"NPOSIXU",               	/* 0x21 */
+	"NPOSIXA",               	/* 0x22 */
+	"CLUMP",                 	/* 0x23 */
+	"BRANCH",                	/* 0x24 */
+	"EXACT",                 	/* 0x25 */
+	"EXACTL",                	/* 0x26 */
+	"EXACTF",                	/* 0x27 */
+	"EXACTFL",               	/* 0x28 */
+	"EXACTFU",               	/* 0x29 */
+	"EXACTFAA",              	/* 0x2a */
+	"EXACTFUP",              	/* 0x2b */
+	"EXACTFLU8",             	/* 0x2c */
+	"EXACTFAA_NO_TRIE",      	/* 0x2d */
+	"EXACT_ONLY8",           	/* 0x2e */
+	"EXACTFU_ONLY8",         	/* 0x2f */
+	"EXACTFU_S_EDGE",        	/* 0x30 */
+	"NOTHING",               	/* 0x31 */
+	"TAIL",                  	/* 0x32 */
+	"STAR",                  	/* 0x33 */
+	"PLUS",                  	/* 0x34 */
+	"CURLY",                 	/* 0x35 */
+	"CURLYN",                	/* 0x36 */
+	"CURLYM",                	/* 0x37 */
+	"CURLYX",                	/* 0x38 */
+	"WHILEM",                	/* 0x39 */
+	"OPEN",                  	/* 0x3a */
+	"CLOSE",                 	/* 0x3b */
+	"SROPEN",                	/* 0x3c */
+	"SRCLOSE",               	/* 0x3d */
+	"REF",                   	/* 0x3e */
+	"REFF",                  	/* 0x3f */
+	"REFFL",                 	/* 0x40 */
+	"REFFU",                 	/* 0x41 */
+	"REFFA",                 	/* 0x42 */
+	"REFN",                  	/* 0x43 */
+	"REFFN",                 	/* 0x44 */
+	"REFFLN",                	/* 0x45 */
+	"REFFUN",                	/* 0x46 */
+	"REFFAN",                	/* 0x47 */
+	"LONGJMP",               	/* 0x48 */
+	"BRANCHJ",               	/* 0x49 */
+	"IFMATCH",               	/* 0x4a */
+	"UNLESSM",               	/* 0x4b */
+	"SUSPEND",               	/* 0x4c */
+	"IFTHEN",                	/* 0x4d */
+	"GROUPP",                	/* 0x4e */
+	"EVAL",                  	/* 0x4f */
+	"MINMOD",                	/* 0x50 */
+	"LOGICAL",               	/* 0x51 */
+	"RENUM",                 	/* 0x52 */
+	"TRIE",                  	/* 0x53 */
+	"TRIEC",                 	/* 0x54 */
+	"AHOCORASICK",           	/* 0x55 */
+	"AHOCORASICKC",          	/* 0x56 */
+	"GOSUB",                 	/* 0x57 */
+	"GROUPPN",               	/* 0x58 */
+	"INSUBP",                	/* 0x59 */
+	"DEFINEP",               	/* 0x5a */
+	"ENDLIKE",               	/* 0x5b */
+	"OPFAIL",                	/* 0x5c */
+	"ACCEPT",                	/* 0x5d */
+	"VERB",                  	/* 0x5e */
+	"PRUNE",                 	/* 0x5f */
+	"MARKPOINT",             	/* 0x60 */
+	"SKIP",                  	/* 0x61 */
+	"COMMIT",                	/* 0x62 */
+	"CUTGROUP",              	/* 0x63 */
+	"KEEPS",                 	/* 0x64 */
+	"LNBREAK",               	/* 0x65 */
+	"OPTIMIZED",             	/* 0x66 */
+	"PSEUDO",                	/* 0x67 */
 	/* ------------ States ------------- */
 	"TRIE_next",             	/* REGNODE_MAX +0x01 */
 	"TRIE_next_fail",        	/* REGNODE_MAX +0x02 */
@@ -773,7 +778,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0xFC, 0xE1, 0x7F, 0x19, 0x00, 0x00, 0x00
+    0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0xF8, 0xC3, 0xFF, 0x32, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -786,8 +791,8 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__;
 #else
 EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
     REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, ANYOFPOSIXL, ANYOFH, ANYOFHb,
-    ANYOFM, NANYOFM, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD, NPOSIXL,
-    NPOSIXU, NPOSIXA,
+    ANYOFHr, ANYOFM, NANYOFM, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD,
+    NPOSIXL, NPOSIXU, NPOSIXA,
     0
 };
 #endif /* DOINIT */
@@ -796,7 +801,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0x00, 0xFF, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0xFF, 0xFF, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
-- 
1.8.3.1