This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Use values if known at compile time
authorKarl Williamson <public@khwilliamson.com>
Wed, 12 Dec 2012 18:35:06 +0000 (11:35 -0700)
committerKarl Williamson <public@khwilliamson.com>
Sat, 22 Dec 2012 18:11:27 +0000 (11:11 -0700)
When compiling a regular expression, it is too expensive to go out and
load from disk the list of code points that match a particular Posix
character class.  Some of these classes have so few code points, that
the lists are compiled in, but others are too large for that, for
example the list of code points that match \w.  Also, it is known at
compile time which of the 256 code points within the Latin-1 range match
any Posix character class.

The lists for beyond Latin-1 are demand-loaded upon first need for each,
thus the loading is deferred to execution time.  This is less efficient
than doing it a compilation time, but many many programs will never need
to load them, so the deferral is a big gain.  However, once loaded, they
stay loaded (in this thread and all sub threads).  It may be that at the
time a regex is being compiled that a list it needs has already been
loaded for some other reason.  Then, the data is already available for
free.

This commit changes to use this for some character classes whose
non-complemented forms are wanted.  Before pushing this to blead, I
realized a better way to do this, and so the code in this commit gets
ripped out some commits hence.  If I try to rebase this commit away, it
turns up too many "failure to apply" messages for the intermediate
commits, so I'm leaving this one in.

regcomp.c

index f7e0361..ffb23e0 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11373,7 +11373,17 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
  * destlist       is the inversion list for non-locale rules that this class is
  *                to be added to
  * sourcelist     is the ASCII-range inversion list to add under /a rules
- * l1_sourcelist  is the Latin1 range list to use otherwise.
+ * swash          points to the swash that contains the whole inversion list for
+ *                this class.  If NULL, it means that the swash has not been
+ *                computed yet.  This macro does not do the large amount of
+ *                work necessary to compute it, but instead uses the inversion
+ *                list from the next parameter so that all of Latin1 is known
+ *                at compile time, and the rest is deferred until run-time, as
+ *                defined by the final two parameters.  In other words, it may
+ *                be that the swash has already been loaded by the time this
+ *                macro is called, in which case we use it; but if not loaded,
+ *                we don't take the time to compute it now.
+ * l1_sourcelist  is the Latin1 range list.
  * Xpropertyname  is the name to add to <run_time_list> of the property to
  *                specify the code points above Latin1 that will have to be
  *                determined at run-time
@@ -11383,25 +11393,30 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
  * time */
 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
-                              l1_sourcelist, Xpropertyname, run_time_list) \
+                       swash, l1_sourcelist, Xpropertyname, run_time_list) \
        /* First, resolve whether to use the ASCII-only list or the L1     \
         * list */                                                         \
         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
-                Xpropertyname, run_time_list)
+                swash, Xpropertyname, run_time_list)
 
 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
-                Xpropertyname, run_time_list)                              \
+                swash, Xpropertyname, run_time_list)                       \
     /* If not /a matching, there are going to be code points we will have  \
      * to defer to runtime to look-up */                                   \
-    if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
-        Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
-    }                                                                      \
-    if (LOC) {                                                             \
-        ANYOF_CLASS_SET(node, class);                                      \
-    }                                                                      \
+    if (! AT_LEAST_ASCII_RESTRICTED && swash) {                            \
+            DO_POSIX(node, class, destlist, sourcelist, _swash_to_invlist(swash)); \
+    }                                                                  \
     else {                                                                 \
-        _invlist_union(destlist, sourcelist, &destlist);                   \
+        if (! AT_LEAST_ASCII_RESTRICTED) {                        \
+            Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
+        }                                                                  \
+        if (LOC) {                                                         \
+            ANYOF_CLASS_SET(node, class);                                  \
+        }                                                                  \
+        else {                                                             \
+            _invlist_union(destlist, sourcelist, &destlist);               \
+        }                                                                  \
     }
 
 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
@@ -11910,7 +11925,7 @@ parseit:
 
                case ANYOF_ALPHANUMERIC: /* C's alnum, in contrast to \w */
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+                        PL_PosixAlnum, PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], PL_L1PosixAlnum, "XPosixAlnum", listsv);
                    break;
                case ANYOF_NALPHANUMERIC:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -11919,7 +11934,7 @@ parseit:
                    break;
                case ANYOF_ALPHA:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+                        PL_PosixAlpha, PL_utf8_swash_ptrs[_CC_ALPHA], PL_L1PosixAlpha, "XPosixAlpha", listsv);
                    break;
                case ANYOF_NALPHA:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -12021,7 +12036,7 @@ parseit:
                     * ASCII, so call the macro that doesn't have to resolve
                     * them */
                    DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
-                        PL_PosixDigit, "XPosixDigit", listsv);
+                        PL_PosixDigit, PL_utf8_swash_ptrs[_CC_DIGIT], "XPosixDigit", listsv);
                    break;
                case ANYOF_NDIGIT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -12030,7 +12045,7 @@ parseit:
                    break;
                case ANYOF_GRAPH:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+                        PL_PosixGraph, PL_utf8_swash_ptrs[_CC_GRAPH], PL_L1PosixGraph, "XPosixGraph", listsv);
                    break;
                case ANYOF_NGRAPH:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -12058,20 +12073,23 @@ parseit:
                    SV* ascii_source;
                    SV* l1_source;
                    const char *Xname;
+                    SV* swash;
 
                    if (FOLD && ! LOC) {
                        ascii_source = PL_PosixAlpha;
                        l1_source = PL_L1Cased;
                        Xname = "Cased";
+                        swash = NULL;
                    }
                    else {
                        ascii_source = PL_PosixLower;
                        l1_source = PL_L1PosixLower;
                        Xname = "XPosixLower";
+                        swash = PL_utf8_swash_ptrs[_CC_LOWER];
                    }
                    if (namedclass == ANYOF_LOWER) {
                        DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                                    ascii_source, l1_source, Xname, listsv);
+                                    ascii_source, swash, l1_source, Xname, listsv);
                    }
                    else {
                        DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
@@ -12082,7 +12100,7 @@ parseit:
                }
                case ANYOF_PRINT:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+                        PL_PosixPrint, PL_utf8_swash_ptrs[_CC_PRINT], PL_L1PosixPrint, "XPosixPrint", listsv);
                    break;
                case ANYOF_NPRINT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -12091,7 +12109,7 @@ parseit:
                    break;
                case ANYOF_PUNCT:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                        PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+                        PL_PosixPunct, PL_utf8_swash_ptrs[_CC_PUNCT], PL_L1PosixPunct, "XPosixPunct", listsv);
                    break;
                case ANYOF_NPUNCT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
@@ -12120,20 +12138,23 @@ parseit:
                    SV* ascii_source;
                    SV* l1_source;
                    const char *Xname;
+                    SV* swash;
 
                    if (FOLD && ! LOC) {
                        ascii_source = PL_PosixAlpha;
                        l1_source = PL_L1Cased;
                        Xname = "Cased";
+                        swash = NULL;
                    }
                    else {
                        ascii_source = PL_PosixUpper;
                        l1_source = PL_L1PosixUpper;
                        Xname = "XPosixUpper";
+                        swash = PL_utf8_swash_ptrs[_CC_UPPER];
                    }
                    if (namedclass == ANYOF_UPPER) {
                        DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                                    ascii_source, l1_source, Xname, listsv);
+                                    ascii_source, swash, l1_source, Xname, listsv);
                    }
                    else {
                        DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
@@ -12144,7 +12165,7 @@ parseit:
                }
                case ANYOF_WORDCHAR:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
-                            PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                            PL_PosixWord, PL_utf8_swash_ptrs[_CC_WORDCHAR], PL_L1PosixWord, "XPosixWord", listsv);
                    break;
                case ANYOF_NWORDCHAR:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,