This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Keep separate list for [:word:] from \p{Word}
author Karl Williamson <public@khwilliamson.com>
Wed, 18 Jul 2012 20:06:31 +0000 (14:06 -0600)
committer Karl Williamson <public@khwilliamson.com>
Thu, 19 Jul 2012 15:39:07 +0000 (09:39 -0600)
This commit separates out the building up of code points that match the
POSIX properties (including \w, \d, \s) from those that match Unicode
properties.  This will prove useful in future commits.

regcomp.c

index e5054cc..2499435 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10975,9 +10975,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     SV *listsv = NULL;
     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                      than just initialized.  */
-    SV* properties = NULL;    /* Code points that match \p{} \P{}, or that come
-                                 from e.g., [:word:], extended beyond the
-                                 Latin1 range */
+    SV* properties = NULL;    /* Code points that match \p{} \P{} */
+    SV* posixes = NULL;     /* Code points that match classes like [:word:],
+                               extended beyond the Latin1 range */
     UV element_count = 0;   /* Number of distinct elements in the class.
                               Optimizations may be possible if this is tiny */
     UV n;
@@ -11427,19 +11427,19 @@ parseit:
                switch ((I32)namedclass) {
 
                case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
                    break;
                case ANYOF_NALNUMC:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
                    break;
                case ANYOF_ALPHA:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
                    break;
                case ANYOF_NALPHA:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
                    break;
                case ANYOF_ASCII:
@@ -11447,7 +11447,7 @@ parseit:
                        ANYOF_CLASS_SET(ret, namedclass);
                    }
                     else {
-                        _invlist_union(properties, PL_ASCII, &properties);
+                        _invlist_union(posixes, PL_ASCII, &posixes);
                     }
                    break;
                case ANYOF_NASCII:
@@ -11455,48 +11455,48 @@ parseit:
                        ANYOF_CLASS_SET(ret, namedclass);
                    }
                     else {
-                        _invlist_union_complement_2nd(properties,
-                                                    PL_ASCII, &properties);
+                        _invlist_union_complement_2nd(posixes,
+                                                    PL_ASCII, &posixes);
                         if (DEPENDS_SEMANTICS) {
                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                         }
                     }
                    break;
                case ANYOF_BLANK:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                             PL_PosixBlank, PL_XPosixBlank);
                    break;
                case ANYOF_NBLANK:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                             PL_PosixBlank, PL_XPosixBlank);
                    break;
                case ANYOF_CNTRL:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                             PL_PosixCntrl, PL_XPosixCntrl);
                    break;
                case ANYOF_NCNTRL:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                             PL_PosixCntrl, PL_XPosixCntrl);
                    break;
                case ANYOF_DIGIT:
                    /* There are no digits in the Latin1 range outside of
                     * ASCII, so call the macro that doesn't have to resolve
                     * them */
-                   DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
                         PL_PosixDigit, "XPosixDigit", listsv);
                     has_special_charset_op = TRUE;
                    break;
                case ANYOF_NDIGIT:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
                     has_special_charset_op = TRUE;
                    break;
                case ANYOF_GRAPH:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
                    break;
                case ANYOF_NGRAPH:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
                    break;
                case ANYOF_HORIZWS:
@@ -11534,46 +11534,46 @@ parseit:
                        Xname = "XPosixLower";
                    }
                    if (namedclass == ANYOF_LOWER) {
-                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                                     ascii_source, l1_source, Xname, listsv);
                    }
                    else {
                        DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
-                            properties, ascii_source, l1_source, Xname, listsv);
+                            posixes, ascii_source, l1_source, Xname, listsv);
                    }
                    break;
                }
                case ANYOF_PRINT:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
                    break;
                case ANYOF_NPRINT:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
                    break;
                case ANYOF_PUNCT:
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
                    break;
                case ANYOF_NPUNCT:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
                    break;
                case ANYOF_PSXSPC:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                             PL_PosixSpace, PL_XPosixSpace);
                    break;
                case ANYOF_NPSXSPC:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                             PL_PosixSpace, PL_XPosixSpace);
                    break;
                case ANYOF_SPACE:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                             PL_PerlSpace, PL_XPerlSpace);
                     has_special_charset_op = TRUE;
                    break;
                case ANYOF_NSPACE:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                             PL_PerlSpace, PL_XPerlSpace);
                     has_special_charset_op = TRUE;
                    break;
@@ -11595,22 +11595,22 @@ parseit:
                        Xname = "XPosixUpper";
                    }
                    if (namedclass == ANYOF_UPPER) {
-                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                       DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                                     ascii_source, l1_source, Xname, listsv);
                    }
                    else {
                        DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
-                        properties, ascii_source, l1_source, Xname, listsv);
+                        posixes, ascii_source, l1_source, Xname, listsv);
                    }
                    break;
                }
                case ANYOF_ALNUM:   /* Really is 'Word' */
-                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
                     has_special_charset_op = TRUE;
                    break;
                case ANYOF_NALNUM:
-                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+                   DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
                     has_special_charset_op = TRUE;
                    break;
@@ -11628,11 +11628,11 @@ parseit:
                     has_special_non_charset_op = TRUE;
                    break;
                case ANYOF_XDIGIT:
-                    DO_POSIX(ret, namedclass, properties,
+                    DO_POSIX(ret, namedclass, posixes,
                                             PL_PosixXDigit, PL_XPosixXDigit);
                    break;
                case ANYOF_NXDIGIT:
-                    DO_N_POSIX(ret, namedclass, properties,
+                    DO_N_POSIX(ret, namedclass, posixes,
                                             PL_PosixXDigit, PL_XPosixXDigit);
                    break;
                case ANYOF_MAX:
@@ -12117,17 +12117,17 @@ parseit:
        SvREFCNT_dec(fold_intersection);
     }
 
-    /* And combine the result (if any) with any inversion list from properties.
-     * The lists are kept separate up to now because we don't want to fold the
-     * properties */
-    if (properties) {
+    /* And combine the result (if any) with any inversion list from posix
+     * classes.  The lists are kept separate up to now because we don't want to
+     * fold the classes */
+    if (posixes) {
         if (AT_LEAST_UNI_SEMANTICS) {
             if (cp_list) {
-                _invlist_union(cp_list, properties, &cp_list);
-                SvREFCNT_dec(properties);
+                _invlist_union(cp_list, posixes, &cp_list);
+                SvREFCNT_dec(posixes);
             }
             else {
-                cp_list = properties;
+                cp_list = posixes;
             }
         }
         else {
@@ -12135,18 +12135,18 @@ parseit:
             /* Under /d, we put into a separate list the Latin1 things that
              * match only when the target string is utf8 */
             SV* nonascii_but_latin1_properties = NULL;
-            _invlist_intersection(properties, PL_Latin1,
+            _invlist_intersection(posixes, PL_Latin1,
                                   &nonascii_but_latin1_properties);
             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
                               &nonascii_but_latin1_properties);
-            _invlist_subtract(properties, nonascii_but_latin1_properties,
-                              &properties);
+            _invlist_subtract(posixes, nonascii_but_latin1_properties,
+                              &posixes);
             if (cp_list) {
-                _invlist_union(cp_list, properties, &cp_list);
-                SvREFCNT_dec(properties);
+                _invlist_union(cp_list, posixes, &cp_list);
+                SvREFCNT_dec(posixes);
             }
             else {
-                cp_list = properties;
+                cp_list = posixes;
             }
 
             if (depends_list) {
@@ -12160,6 +12160,20 @@ parseit:
         }
     }
 
+    /* And combine the result (if any) with any inversion list from properties.
+     * (Note that in this case, unlike the Posix one above, there is no
+     * <depends_list>, because having a Unicode property forces Unicode
+     * semantics.) */
+    if (properties) {
+        if (cp_list) {
+            _invlist_union(cp_list, properties, &cp_list);
+            SvREFCNT_dec(properties);
+        }
+        else {
+            cp_list = properties;
+        }
+    }
+
     /* Here, we have calculated what code points should be in the character
      * class.
      *