regcomp.c: Simplify
author Karl Williamson <khw@cpan.org>
Tue, 1 May 2018 22:42:29 +0000 (16:42 -0600)
committer Karl Williamson <khw@cpan.org>
Mon, 25 Jun 2018 13:33:30 +0000 (07:33 -0600)
Under /a pattern matching, the matches of the [:posix:] classes are
restricted to the ASCII range.  Previously, in a time/space trade-off
that favored space, we created the list of matching characters at
pattern compilation time by ANDing the full-range Posix class with the
set of ASCII characters.

But now, the tables for just the ASCII-range classes are generated
anyway, so there's no need to do that compilation-time intersection.
This slightly simplifies the code.

embedvar.h
perlapi.h
perlvars.h
regcomp.c
utf8.c

index e038ae7..1b4d67c 100644 (file)
 #define PL_GLatin1             (my_vars->GLatin1)
 #define PL_NonL1NonFinalFold   (my_vars->GNonL1NonFinalFold)
 #define PL_GNonL1NonFinalFold  (my_vars->GNonL1NonFinalFold)
+#define PL_Posix_ptrs          (my_vars->GPosix_ptrs)
+#define PL_GPosix_ptrs         (my_vars->GPosix_ptrs)
 #define PL_SB_invlist          (my_vars->GSB_invlist)
 #define PL_GSB_invlist         (my_vars->GSB_invlist)
 #define PL_SCX_invlist         (my_vars->GSCX_invlist)
index e41d61f..5b004ae 100644 (file)
--- a/perlapi.h
+++ b/perlapi.h
@@ -115,6 +115,8 @@ END_EXTERN_C
 #define PL_Latin1              (*Perl_GLatin1_ptr(NULL))
 #undef  PL_NonL1NonFinalFold
 #define PL_NonL1NonFinalFold   (*Perl_GNonL1NonFinalFold_ptr(NULL))
+#undef  PL_Posix_ptrs
+#define PL_Posix_ptrs          (*Perl_GPosix_ptrs_ptr(NULL))
 #undef  PL_SB_invlist
 #define PL_SB_invlist          (*Perl_GSB_invlist_ptr(NULL))
 #undef  PL_SCX_invlist
index b6cc9ca..ce17ece 100644 (file)
@@ -295,6 +295,7 @@ PERLVAR(G, utf8_xidcont, SV *)
 PERLVAR(G, utf8_xidstart, SV *)
 PERLVAR(G, WB_invlist, SV *)
 PERLVARA(G, XPosix_ptrs, POSIX_CC_COUNT, SV *)
+PERLVARA(G,  Posix_ptrs, POSIX_CC_COUNT, SV *)
 PERLVAR(G, utf8_toupper, SV *)
 PERLVAR(G, utf8_totitle, SV *)
 PERLVAR(G, utf8_tolower, SV *)
index 2d9dd13..f0ade29 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -5613,23 +5613,13 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                     break;
 
                 case NASCII:
-                    invert = 1;
-                    /* FALLTHROUGH */
-               case ASCII:
-                    my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
-
-                    /* This can be handled as a Posix class */
-                    goto join_posix_and_ascii;
-
                 case NPOSIXA:   /* For these, we always know the exact set of
                                    what's matched */
                     invert = 1;
                     /* FALLTHROUGH */
+               case ASCII:
                case POSIXA:
-                    assert(FLAGS(scan) != _CC_ASCII);
-                    _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
-                                          PL_XPosix_ptrs[_CC_ASCII],
-                                          &my_invlist);
+                    my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)]);
                     goto join_posix_and_ascii;
 
                case NPOSIXD:
@@ -17216,21 +17206,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     }
                 }
                 else if (  UNI_SEMANTICS
+                        || AT_LEAST_ASCII_RESTRICTED
                         || classnum == _CC_ASCII
                         || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
                                                   || classnum == _CC_XDIGIT)))
                 {
-                    /* We usually have to worry about /d and /a affecting what
-                     * POSIX classes match, with special code needed for /d
-                     * because we won't know until runtime what all matches.
-                     * But there is no extra work needed under /u, and
-                     * [:ascii:] is unaffected by /a and /d; and :digit: and
-                     * :xdigit: don't have runtime differences under /d.  So we
-                     * can special case these, and avoid some extra work below,
-                     * and at runtime. */
+                    /* We usually have to worry about /d affecting what POSIX
+                     * classes match, with special code needed because we won't
+                     * know until runtime what all matches.  But there is no
+                     * extra work needed under /u and /a; and [:ascii:] is
+                     * unaffected by /d; and :digit: and :xdigit: don't have
+                     * runtime differences under /d.  So we can special case
+                     * these, and avoid some extra work below, and at runtime.
+                     * */
                     _invlist_union_maybe_complement_2nd(
                                                      simple_posixes,
-                                                     PL_XPosix_ptrs[classnum],
+                                                      ((AT_LEAST_ASCII_RESTRICTED)
+                                                       ? PL_Posix_ptrs[classnum]
+                                                       : PL_XPosix_ptrs[classnum]),
                                                      namedclass % 2 != 0,
                                                      &simple_posixes);
                 }
@@ -18054,26 +18047,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
     }
     if (posixes || nposixes) {
-
-        /* We have to adjust /a and /aa */
-        if (AT_LEAST_ASCII_RESTRICTED) {
-
-            /* Under /a and /aa, nothing above ASCII matches these */
-            if (posixes) {
-                _invlist_intersection(posixes,
-                                    PL_XPosix_ptrs[_CC_ASCII],
-                                    &posixes);
-            }
-
-            /* Under /a and /aa, everything above ASCII matches these
-             * complements */
-            if (nposixes) {
-                _invlist_union_complement_2nd(nposixes,
-                                              PL_XPosix_ptrs[_CC_ASCII],
-                                              &nposixes);
-            }
-        }
-
         if (! DEPENDS_SEMANTICS) {
 
             /* For everything but /d, we can just add the current 'posixes' and
diff --git a/utf8.c b/utf8.c
index 51b37c1..bae97da 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -5886,6 +5886,24 @@ Perl_init_uniprops(pTHX)
     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_VERTSPACE_invlist);
     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_XPOSIXWORD_invlist);
     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_XPOSIXXDIGIT_invlist);
+
+    PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_ASCII_invlist);
+    PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_POSIXALNUM_invlist);
+    PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_POSIXALPHA_invlist);
+    PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_POSIXBLANK_invlist);
+    PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PL_CASED_invlist);
+    PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_POSIXCNTRL_invlist);
+    PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_POSIXDIGIT_invlist);
+    PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_POSIXGRAPH_invlist);
+    PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_POSIXLOWER_invlist);
+    PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_POSIXPRINT_invlist);
+    PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_POSIXPUNCT_invlist);
+    PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_POSIXSPACE_invlist);
+    PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_POSIXUPPER_invlist);
+    PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_VERTSPACE_invlist);
+    PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_POSIXWORD_invlist);
+    PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_POSIXXDIGIT_invlist);
+
     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);