regcomp.c: Correct outdated comment

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index cfed452..adeec44 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -403,9 +403,13 @@ static const scan_data_t zero_scan_data =
  
  #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
  
-#define OOB_UNICODE            12345678
  #define OOB_NAMEDCLASS         -1
  
+/* There is no code point that is out-of-bounds, so this is problematic.  But
+ * its only current use is to initialize a variable that is always set before
+ * being looked at. */
+#define OOB_UNICODE            0xDEADBEEF
+
  #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
  #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
  
@@ -2630,9 +2634,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   *      problematic sequences.  This delta is used by the caller to adjust the
   *      min length of the match, and the delta between min and max, so that the
   *      optimizer doesn't reject these possibilities based on size constraints.
- * 2)   These sequences are not currently correctly handled by the trie code
- *      either, so it changes the joined node type to ops that are not handled
- *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
+ * 2)   These sequences require special handling by the trie code, so it
+ *      changes the joined node type to ops for the trie's benefit, those new
+ *      ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
   * 3)   This is sufficient for the two Greek sequences (described below), but
   *      the one involving the Sharp s (\xDF) needs more.  The node type
   *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -4404,6 +4408,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             data->flags |= (OP(scan) == MEOL
                             ? SF_BEFORE_MEOL
                             : SF_BEFORE_SEOL);
+           SCAN_COMMIT(pRExC_state, data, minlenp);
+
         }
         else if (  PL_regkind[OP(scan)] == BRANCHJ
                  /* Lookbehind, or need to calculate parens/evals/stclass: */
@@ -5025,7 +5031,7 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
   * the original pattern needs upgrading to utf8.
   */
  
-bool
+static bool
  S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
      char *pat, STRLEN plen)
  {
@@ -8093,6 +8099,36 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
      return TRUE;
  }
  
+PERL_STATIC_INLINE UV
+S_invlist_highest(pTHX_ SV* const invlist)
+{
+    /* Returns the highest code point matched by the inversion list <invlist>.
+     * The result is ambiguous: 0 is returned both when the highest matching
+     * code point really is 0 and when the list is empty.  If the distinction
+     * matters to you, check for emptiness before calling this function */
+
+    UV len = invlist_len(invlist);
+    UV *array;
+
+    PERL_ARGS_ASSERT_INVLIST_HIGHEST;
+
+    if (len == 0) {
+       return 0;
+    }
+
+    array = invlist_array(invlist);
+
+    /* The last element in the array in the inversion list always starts a
+     * range that goes to infinity.  That range may be for code points that are
+     * matched in the inversion list, or it may be for ones that aren't
+     * matched.  In the latter case, the highest code point in the set is one
+     * less than the beginning of this range; otherwise it is the final element
+     * of this range: infinity, which is represented here by UV_MAX */
+    return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
+           ? UV_MAX
+           : array[len - 1] - 1;
+}
+
  #ifndef PERL_IN_XSUB_RE
  SV *
  Perl__invlist_contents(pTHX_ SV* const invlist)
@@ -10904,16 +10940,18 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
      }
  
  /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
- * this and DO_N_POSIX */
+ * this and DO_N_POSIX.  Sets <matches_above_unicode> to TRUE unless the match
+ * is ASCII-restricted, in which case it is left unchanged */
  #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
-                              l1_sourcelist, Xpropertyname, run_time_list) \
+       l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
      if (AT_LEAST_ASCII_RESTRICTED) {                                       \
          _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
      }                                                                      \
      else {                                                                 \
          Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+        matches_above_unicode = TRUE;                                      \
         if (LOC) {                                                         \
-           ANYOF_CLASS_SET(node, namedclass);                             \
+            ANYOF_CLASS_SET(node, namedclass);                            \
         }                                                                  \
         else {                                                             \
              SV* scratch_list = NULL;                                       \
@@ -10966,7 +11004,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
      register regnode *ret;
      STRLEN numlen;
-    IV namedclass;
+    IV namedclass = OOB_NAMEDCLASS;
      char *rangebegin = NULL;
      bool need_class = 0;
      bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
@@ -10974,6 +11012,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                       than just initialized.  */
      SV* properties = NULL;    /* Code points that match \p{} \P{} */
+    SV* posixes = NULL;     /* Code points that match classes like [:word:],
+                               extended beyond the Latin1 range */
      UV element_count = 0;   /* Number of distinct elements in the class.
                                Optimizations may be possible if this is tiny */
      UV n;
@@ -10995,7 +11035,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
  
      /* Set if a component of this character class is user-defined; just passed
       * on to the engine */
-    UV has_user_defined_property = 0;
+    bool has_user_defined_property = FALSE;
  
      /* inversion list of code points this node matches only when the target
       * string is in UTF-8.  (Because is under /d) */
@@ -11014,6 +11054,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      UV literal_endpoint = 0;
  #endif
      UV stored = 0;  /* how many chars stored in the bitmap */
+    bool invert = FALSE;    /* Is this class to be complemented */
+
+    /* Is there anything like \W or [:^digit:] that matches above the legal
+     * Unicode range? */
+    bool runtime_posix_matches_above_Unicode = FALSE;
  
      regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
          case we need to change the emitted regop to an EXACT. */
@@ -11038,8 +11083,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
      if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
         RExC_naughty++;
         RExC_parse++;
-       if (!SIZE_ONLY)
-           ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+        invert = TRUE;
  
          /* We have decided to not allow multi-char folds in inverted character
          * classes, due to the confusion that can happen, especially with
@@ -11164,6 +11208,7 @@ parseit:
                      SV** invlistsvp;
                      SV* invlist;
                      char* name;
+
                     if (UCHARAT(RExC_parse) == '^') {
                          RExC_parse++;
                          n--;
@@ -11218,7 +11263,7 @@ parseit:
                          Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
                                          (value == 'p' ? '+' : '!'),
                                          name);
-                        has_user_defined_property = 1;
+                        has_user_defined_property = TRUE;
  
                          /* We don't know yet, so have to assume that the
                           * property could match something in the Latin1 range,
@@ -11230,17 +11275,14 @@ parseit:
                          /* Here, did get the swash and its inversion list.  If
                           * the swash is from a user-defined property, then this
                           * whole character class should be regarded as such */
-                        SV** user_defined_svp =
-                                            hv_fetchs(MUTABLE_HV(SvRV(swash)),
-                                                        "USER_DEFINED", FALSE);
-                        if (user_defined_svp) {
-                            has_user_defined_property
-                                                    |= SvUV(*user_defined_svp);
-                        }
+                        has_user_defined_property =
+                                                _is_swash_user_defined(swash);
  
                          /* Invert if asking for the complement */
                          if (value == 'P') {
-                           _invlist_union_complement_2nd(properties, invlist, &properties);
+                           _invlist_union_complement_2nd(properties,
+                                                          invlist,
+                                                          &properties);
  
                              /* The swash can't be used as-is, because we've
                              * inverted things; delay removing it to here after
@@ -11342,15 +11384,20 @@ parseit:
             literal_endpoint++;
  #endif
  
-       if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
-
-           /* What matches in a locale is not known until runtime, so need to
-            * (one time per class) allocate extra space to pass to regexec.
-            * The space will contain a bit for each named class that is to be
-            * matched against.  This isn't needed for \p{} and pseudo-classes,
-            * as they are not affected by locale, and hence are dealt with
-            * separately */
-           if (LOC && namedclass < ANYOF_MAX && ! need_class) {
+            /* What matches in a locale is not known until runtime.  This
+             * includes what the Posix classes (like \w, [:space:]) match.
+             * Room must be reserved (one time per class) to store such
+             * classes, either if Perl is compiled so that locale nodes always
+             * should have this space, or if there is such class info to be
+             * stored.  The space will contain a bit for each named class that
+             * is to be matched against.  This isn't needed for \p{} and
+             * pseudo-classes, as they are not affected by locale, and hence
+             * are dealt with separately */
+           if (LOC
+                && ! need_class
+                && (ANYOF_LOCALE == ANYOF_CLASS
+                    || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
+            {
                 need_class = 1;
                 if (SIZE_ONLY) {
                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
@@ -11362,6 +11409,8 @@ parseit:
                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
             }
  
+       if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
              * literal, as is the character that began the false range, i.e.
              * the 'a' in the examples */
@@ -11419,27 +11468,29 @@ parseit:
                 switch ((I32)namedclass) {
  
                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                          PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
                     break;
                 case ANYOF_NALNUMC:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
+                        runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_ALPHA:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                          PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
                     break;
                 case ANYOF_NALPHA:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
+                        runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_ASCII:
                     if (LOC) {
                         ANYOF_CLASS_SET(ret, namedclass);
                     }
                      else {
-                        _invlist_union(properties, PL_ASCII, &properties);
+                        _invlist_union(posixes, PL_ASCII, &posixes);
                      }
                     break;
                 case ANYOF_NASCII:
@@ -11447,49 +11498,51 @@ parseit:
                         ANYOF_CLASS_SET(ret, namedclass);
                     }
                      else {
-                        _invlist_union_complement_2nd(properties,
-                                                    PL_ASCII, &properties);
+                        _invlist_union_complement_2nd(posixes,
+                                                    PL_ASCII, &posixes);
                          if (DEPENDS_SEMANTICS) {
                              ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                          }
                      }
                     break;
                 case ANYOF_BLANK:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                              PL_PosixBlank, PL_XPosixBlank);
                     break;
                 case ANYOF_NBLANK:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                              PL_PosixBlank, PL_XPosixBlank);
                     break;
                 case ANYOF_CNTRL:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                              PL_PosixCntrl, PL_XPosixCntrl);
                     break;
                 case ANYOF_NCNTRL:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                              PL_PosixCntrl, PL_XPosixCntrl);
                     break;
                 case ANYOF_DIGIT:
                     /* There are no digits in the Latin1 range outside of
                      * ASCII, so call the macro that doesn't have to resolve
                      * them */
-                   DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
                          PL_PosixDigit, "XPosixDigit", listsv);
                      has_special_charset_op = TRUE;
                     break;
                 case ANYOF_NDIGIT:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                        PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                        PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
+                        runtime_posix_matches_above_Unicode);
                      has_special_charset_op = TRUE;
                     break;
                 case ANYOF_GRAPH:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                          PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
                     break;
                 case ANYOF_NGRAPH:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
+                        runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_HORIZWS:
                     /* For these, we use the cp_list, as /d doesn't make a
@@ -11526,46 +11579,49 @@ parseit:
                         Xname = "XPosixLower";
                     }
                     if (namedclass == ANYOF_LOWER) {
-                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                                      ascii_source, l1_source, Xname, listsv);
                     }
                     else {
                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
-                            properties, ascii_source, l1_source, Xname, listsv);
+                            posixes, ascii_source, l1_source, Xname, listsv,
+                            runtime_posix_matches_above_Unicode);
                     }
                     break;
                 }
                 case ANYOF_PRINT:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                          PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
                     break;
                 case ANYOF_NPRINT:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
+                        runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_PUNCT:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                          PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
                     break;
                 case ANYOF_NPUNCT:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
+                        runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_PSXSPC:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                              PL_PosixSpace, PL_XPosixSpace);
                     break;
                 case ANYOF_NPSXSPC:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                              PL_PosixSpace, PL_XPosixSpace);
                     break;
                 case ANYOF_SPACE:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                              PL_PerlSpace, PL_XPerlSpace);
                      has_special_charset_op = TRUE;
                     break;
                 case ANYOF_NSPACE:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                              PL_PerlSpace, PL_XPerlSpace);
                      has_special_charset_op = TRUE;
                     break;
@@ -11587,23 +11643,25 @@ parseit:
                         Xname = "XPosixUpper";
                     }
                     if (namedclass == ANYOF_UPPER) {
-                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                                      ascii_source, l1_source, Xname, listsv);
                     }
                     else {
                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
-                        properties, ascii_source, l1_source, Xname, listsv);
+                        posixes, ascii_source, l1_source, Xname, listsv,
+                        runtime_posix_matches_above_Unicode);
                     }
                     break;
                 }
                 case ANYOF_ALNUM:   /* Really is 'Word' */
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                              PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
                      has_special_charset_op = TRUE;
                     break;
                 case ANYOF_NALNUM:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
-                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
+                            runtime_posix_matches_above_Unicode);
                      has_special_charset_op = TRUE;
                     break;
                 case ANYOF_VERTWS:
@@ -11620,11 +11678,11 @@ parseit:
                      has_special_non_charset_op = TRUE;
                     break;
                 case ANYOF_XDIGIT:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                              PL_PosixXDigit, PL_XPosixXDigit);
                     break;
                 case ANYOF_NXDIGIT:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                              PL_PosixXDigit, PL_XPosixXDigit);
                     break;
                 case ANYOF_MAX:
@@ -11718,7 +11776,6 @@ parseit:
           || (prevvalue == '0' && value == '9')))
      {
          U8 op;
-        bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
          const char * cur_parse = RExC_parse;
  
          if (has_special_charset_op) {
@@ -11815,19 +11872,11 @@ parseit:
  
         SV* fold_intersection = NULL;
  
-        const UV highest_index = invlist_len(cp_list) - 1;
-
          /* In the Latin1 range, the characters that can be folded-to or -from
           * are precisely the alphabetic characters.  If the highest code point
           * is within Latin1, we can use the compiled-in list, and not have to
-         * go out to disk.  If the last element in the array is in the
-         * inversion list set, it starts a range that goes to infinity, so the
-         * maximum of the inversion list is definitely above Latin1.
-         * Otherwise, it starts a range that isn't in the set, so the max is
-         * one less than it */
-        if (! ELEMENT_RANGE_MATCHES_INVLIST(highest_index)
-            && invlist_array(cp_list)[highest_index] <= 256)
-        {
+         * go out to disk. */
+        if (invlist_highest(cp_list) < 256) {
              _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
          }
          else {
@@ -11950,15 +11999,13 @@ parseit:
                          switch (j) {
                              case 'k':
                              case 'K':
-                                /* KELVIN SIGN */
                                  cp_list =
-                                    add_cp_to_invlist(cp_list, 0x212A);
+                                    add_cp_to_invlist(cp_list, KELVIN_SIGN);
                                  break;
                              case 's':
                              case 'S':
-                                /* LATIN SMALL LETTER LONG S */
-                                cp_list =
-                                    add_cp_to_invlist(cp_list, 0x017F);
+                                cp_list = add_cp_to_invlist(cp_list,
+                                                    LATIN_SMALL_LETTER_LONG_S);
                                  break;
                              case MICRO_SIGN:
                                  cp_list = add_cp_to_invlist(cp_list,
@@ -11968,9 +12015,8 @@ parseit:
                                  break;
                              case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
                              case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
-                                /* ANGSTROM SIGN */
                                  cp_list =
-                                        add_cp_to_invlist(cp_list, 0x212B);
+                                    add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
                                  break;
                              case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
                                  cp_list = add_cp_to_invlist(cp_list,
@@ -12109,36 +12155,36 @@ parseit:
         SvREFCNT_dec(fold_intersection);
      }
  
-    /* And combine the result (if any) with any inversion list from properties.
-     * The lists are kept separate up to now because we don't want to fold the
-     * properties */
-    if (properties) {
+    /* And combine the result (if any) with any inversion list from posix
+     * classes.  The lists are kept separate up to now because we don't want to
+     * fold the classes */
+    if (posixes) {
          if (AT_LEAST_UNI_SEMANTICS) {
              if (cp_list) {
-                _invlist_union(cp_list, properties, &cp_list);
-                SvREFCNT_dec(properties);
+                _invlist_union(cp_list, posixes, &cp_list);
+                SvREFCNT_dec(posixes);
              }
              else {
-                cp_list = properties;
+                cp_list = posixes;
              }
          }
          else {
  
-            /* Under /d, we put the things that match only when the target
-             * string is utf8, into a separate list */
+            /* Under /d, we put into a separate list the Latin1 things that
+             * match only when the target string is utf8 */
              SV* nonascii_but_latin1_properties = NULL;
-            _invlist_intersection(properties, PL_Latin1,
+            _invlist_intersection(posixes, PL_Latin1,
                                    &nonascii_but_latin1_properties);
              _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
                                &nonascii_but_latin1_properties);
-            _invlist_subtract(properties, nonascii_but_latin1_properties,
-                              &properties);
+            _invlist_subtract(posixes, nonascii_but_latin1_properties,
+                              &posixes);
              if (cp_list) {
-                _invlist_union(cp_list, properties, &cp_list);
-                SvREFCNT_dec(properties);
+                _invlist_union(cp_list, posixes, &cp_list);
+                SvREFCNT_dec(posixes);
              }
              else {
-                cp_list = properties;
+                cp_list = posixes;
              }
  
              if (depends_list) {
@@ -12152,6 +12198,51 @@ parseit:
          }
      }
  
+    /* And combine the result (if any) with any inversion list from properties.
+     * The lists are kept separate up to now so that we can distinguish the two
+     * in regards to matching above-Unicode.  A run-time warning is generated
+     * if a Unicode property is matched against a non-Unicode code point. But,
+     * we allow user-defined properties to match anything, without any warning,
+     * and we also suppress the warning if there is a portion of the character
+     * class that isn't a Unicode property, and which matches above Unicode, \W
+     * or [\x{110000}] for example.
+     * (Note that in this case, unlike the Posix one above, there is no
+     * <depends_list>, because having a Unicode property forces Unicode
+     * semantics) */
+    if (properties) {
+        bool warn_super = ! has_user_defined_property;
+        if (cp_list) {
+
+            /* If it matters to the final outcome, see if a non-property
+             * component of the class matches above Unicode.  If so, the
+             * warning gets suppressed.  This is true even if just a single
+             * such code point is specified: although that is not strictly
+             * correct if some other such code point is matched against, the
+             * fact that they are using above-Unicode code points indicates
+             * they should know the issues involved */
+            if (warn_super) {
+                bool non_prop_matches_above_Unicode =
+                            runtime_posix_matches_above_Unicode
+                            | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
+                if (invert) {
+                    non_prop_matches_above_Unicode =
+                                            !  non_prop_matches_above_Unicode;
+                }
+                warn_super = ! non_prop_matches_above_Unicode;
+            }
+
+            _invlist_union(properties, cp_list, &cp_list);
+            SvREFCNT_dec(properties);
+        }
+        else {
+            cp_list = properties;
+        }
+
+        if (warn_super) {
+            ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+        }
+    }
+
      /* Here, we have calculated what code points should be in the character
       * class.
       *
@@ -12165,7 +12256,7 @@ parseit:
       * optimize locale.  Doing so perhaps could be done as long as there is
       * nothing like \w in it; some thought also would have to be given to the
       * interaction with above 0x100 chars */
-    if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
+    if (invert
          && ! LOC
         && ! depends_list
         && ! unicode_alternate
@@ -12180,7 +12271,7 @@ parseit:
          }
  
         /* Clear the invert flag since have just done it here */
-       ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
+       invert = FALSE;
      }
  
      /* Here, <cp_list> contains all the code points we can determine at
@@ -12232,6 +12323,10 @@ parseit:
         }
      }
  
+    if (invert) {
+        ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+    }
+
      /* Combine the two lists into one. */
      if (depends_list) {
         if (cp_list) {
@@ -12376,9 +12471,9 @@ parseit:
          * av[2] stores the multicharacter foldings, used later in
          *       regexec.c:S_reginclass().
          * av[3] stores the cp_list inversion list for use in addition or
-        *       instead of av[0]; not used if av[1] isn't NULL
+        *       instead of av[0]; used only if av[1] is NULL
          * av[4] is set if any component of the class is from a user-defined
-        *       property; not used if av[1] isn't NULL */
+        *       property; used only if av[1] is NULL */
         AV * const av = newAV();
         SV *rv;