This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.h, regcomp.c: Use mnemonics for Unicode chars
[perl5.git] / regcomp.c
index 6aca8e3..e3e85c6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10936,16 +10936,18 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
     }
 
 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
- * this and DO_N_POSIX */
+ * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
+ * otherwise */
 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
-                              l1_sourcelist, Xpropertyname, run_time_list) \
+       l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
     }                                                                      \
     else {                                                                 \
         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+        matches_above_unicode = TRUE;                                      \
        if (LOC) {                                                         \
-           ANYOF_CLASS_SET(node, namedclass);                             \
+            ANYOF_CLASS_SET(node, namedclass);                            \
        }                                                                  \
        else {                                                             \
             SV* scratch_list = NULL;                                       \
@@ -11050,6 +11052,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     UV stored = 0;  /* how many chars stored in the bitmap */
     bool invert = FALSE;    /* Is this class to be complemented */
 
+    /* Is there any thing like \W or [:^digit:] that matches above the legal
+     * Unicode range? */
+    bool runtime_posix_matches_above_Unicode = FALSE;
+
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
     const char * orig_parse = RExC_parse;
@@ -11198,6 +11204,7 @@ parseit:
                     SV** invlistsvp;
                     SV* invlist;
                     char* name;
+
                    if (UCHARAT(RExC_parse) == '^') {
                         RExC_parse++;
                         n--;
@@ -11462,7 +11469,8 @@ parseit:
                    break;
                case ANYOF_NALNUMC:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
+                        runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_ALPHA:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -11470,7 +11478,8 @@ parseit:
                    break;
                case ANYOF_NALPHA:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
+                        runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_ASCII:
                    if (LOC) {
@@ -11518,7 +11527,8 @@ parseit:
                    break;
                case ANYOF_NDIGIT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+                        PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
+                        runtime_posix_matches_above_Unicode);
                     has_special_charset_op = TRUE;
                    break;
                case ANYOF_GRAPH:
@@ -11527,7 +11537,8 @@ parseit:
                    break;
                case ANYOF_NGRAPH:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
+                        runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_HORIZWS:
                    /* For these, we use the cp_list, as /d doesn't make a
@@ -11569,7 +11580,8 @@ parseit:
                    }
                    else {
                        DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
-                            posixes, ascii_source, l1_source, Xname, listsv);
+                            posixes, ascii_source, l1_source, Xname, listsv,
+                            runtime_posix_matches_above_Unicode);
                    }
                    break;
                }
@@ -11579,7 +11591,8 @@ parseit:
                    break;
                case ANYOF_NPRINT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
+                        runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_PUNCT:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -11587,7 +11600,8 @@ parseit:
                    break;
                case ANYOF_NPUNCT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
+                        runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_PSXSPC:
                     DO_POSIX(ret, namedclass, posixes,
@@ -11630,7 +11644,8 @@ parseit:
                    }
                    else {
                        DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
-                        posixes, ascii_source, l1_source, Xname, listsv);
+                        posixes, ascii_source, l1_source, Xname, listsv,
+                        runtime_posix_matches_above_Unicode);
                    }
                    break;
                }
@@ -11641,7 +11656,8 @@ parseit:
                    break;
                case ANYOF_NALNUM:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
+                            runtime_posix_matches_above_Unicode);
                     has_special_charset_op = TRUE;
                    break;
                case ANYOF_VERTWS:
@@ -11979,15 +11995,13 @@ parseit:
                         switch (j) {
                             case 'k':
                             case 'K':
-                                /* KELVIN SIGN */
                                 cp_list =
-                                    add_cp_to_invlist(cp_list, 0x212A);
+                                    add_cp_to_invlist(cp_list, KELVIN_SIGN);
                                 break;
                             case 's':
                             case 'S':
-                                /* LATIN SMALL LETTER LONG S */
-                                cp_list =
-                                    add_cp_to_invlist(cp_list, 0x017F);
+                                cp_list = add_cp_to_invlist(cp_list,
+                                                    LATIN_SMALL_LETTER_LONG_S);
                                 break;
                             case MICRO_SIGN:
                                 cp_list = add_cp_to_invlist(cp_list,
@@ -11997,9 +12011,8 @@ parseit:
                                 break;
                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
-                                /* ANGSTROM SIGN */
                                 cp_list =
-                                        add_cp_to_invlist(cp_list, 0x212B);
+                                    add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
                                 break;
                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
                                 cp_list = add_cp_to_invlist(cp_list,
@@ -12182,17 +12195,48 @@ parseit:
     }
 
     /* And combine the result (if any) with any inversion list from properties.
+     * The lists are kept separate up to now so that we can distinguish the two
+     * in regards to matching above-Unicode.  A run-time warning is generated
+     * if a Unicode property is matched against a non-Unicode code point. But,
+     * we allow user-defined properties to match anything, without any warning,
+     * and we also suppress the warning if there is a portion of the character
+     * class that isn't a Unicode property, and which matches above Unicode, \W
+     * or [\x{110000}] for example.
      * (Note that in this case, unlike the Posix one above, there is no
      * <depends_list>, because having a Unicode property forces Unicode
      * semantics */
     if (properties) {
+        bool warn_super = ! has_user_defined_property;
         if (cp_list) {
-            _invlist_union(cp_list, properties, &cp_list);
+
+            /* If it matters to the final outcome, see if a non-property
+             * component of the class matches above Unicode.  If so, the
+             * warning gets suppressed.  This is true even if just a single
+             * such code point is specified, as though not strictly correct if
+             * another such code point is matched against, the fact that they
+             * are using above-Unicode code points indicates they should know
+             * the issues involved */
+            if (warn_super) {
+                bool non_prop_matches_above_Unicode =
+                            runtime_posix_matches_above_Unicode
+                            | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
+                if (invert) {
+                    non_prop_matches_above_Unicode =
+                                            !  non_prop_matches_above_Unicode;
+                }
+                warn_super = ! non_prop_matches_above_Unicode;
+            }
+
+            _invlist_union(properties, cp_list, &cp_list);
             SvREFCNT_dec(properties);
         }
         else {
             cp_list = properties;
         }
+
+        if (warn_super) {
+            ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+        }
     }
 
     /* Here, we have calculated what code points should be in the character