regcomp.c: Add capability for regclass() to return inversion list
authorKarl Williamson <public@khwilliamson.com>
Thu, 10 Jan 2013 23:42:19 +0000 (16:42 -0700)
committerKarl Williamson <public@khwilliamson.com>
Fri, 11 Jan 2013 18:50:38 +0000 (11:50 -0700)
This is currently unused, but will have regclass() return an inversion
list instead of a node.

embed.fnc
embed.h
proto.h
regcomp.c
t/porting/diag.t

index 88a2444..2972b6d 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1962,7 +1962,8 @@ Es        |STRLEN |reguni         |NN const struct RExC_state_t *pRExC_state \
 Es     |regnode*|regclass      |NN struct RExC_state_t *pRExC_state \
                                |NN I32 *flagp|U32 depth|const bool stop_at_1 \
                                |bool allow_multi_fold                        \
-                               |const bool silence_non_portable
+                               |const bool silence_non_portable              \
+                               |NULLOK SV** ret_invlist
 Es     |regnode*|reg_node      |NN struct RExC_state_t *pRExC_state|U8 op
 Es     |UV     |reg_recode     |const char value|NN SV **encp
 Es     |regnode*|regpiece      |NN struct RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index eff78d4..9a5b636 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define reganode(a,b,c)                S_reganode(aTHX_ a,b,c)
 #define regatom(a,b,c)         S_regatom(aTHX_ a,b,c)
 #define regbranch(a,b,c,d)     S_regbranch(aTHX_ a,b,c,d)
-#define regclass(a,b,c,d,e,f)  S_regclass(aTHX_ a,b,c,d,e,f)
+#define regclass(a,b,c,d,e,f,g)        S_regclass(aTHX_ a,b,c,d,e,f,g)
 #define reginsert(a,b,c,d)     S_reginsert(aTHX_ a,b,c,d)
 #define regpatws               S_regpatws
 #define regpiece(a,b,c)                S_regpiece(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 6816e56..0cab673 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6639,7 +6639,7 @@ STATIC regnode*   S_regbranch(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp,
 #define PERL_ARGS_ASSERT_REGBRANCH     \
        assert(pRExC_state); assert(flagp)
 
-STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable)
+STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, SV** ret_invlist)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_REGCLASS      \
index 3d1d4ce..bc48915 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10108,7 +10108,8 @@ tryagain:
         ret = regclass(pRExC_state, flagp,depth+1,
                        FALSE, /* means parse the whole char class */
                        TRUE, /* allow multi-char folds */
-                       FALSE); /* don't silence non-portable warnings. */
+                       FALSE, /* don't silence non-portable warnings. */
+                       NULL);
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched [");
@@ -10305,9 +10306,10 @@ tryagain:
                 ret = regclass(pRExC_state, flagp,depth+1,
                                TRUE, /* means just parse this element */
                                FALSE, /* don't allow multi-char folds */
-                               FALSE); /* don't silence non-portable warnings.
+                               FALSE, /* don't silence non-portable warnings.
                                          It would be a bug if these returned
                                          non-portables */
+                               NULL);
 
                RExC_parse--;
 
@@ -11291,7 +11293,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
 STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  const bool stop_at_1, bool allow_multi_folds,
-                 const bool silence_non_portable)
+                 const bool silence_non_portable, SV** ret_invlist)
 {
     /* parse a bracketed class specification.  Most of these will produce an
      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
@@ -11607,8 +11609,13 @@ parseit:
                         }
 
                         /* Here didn't find it.  It could be a user-defined
-                         * property that will be available at run-time.  Add it
-                         * to the list to look up then */
+                         * property that will be available at run-time.  If we
+                         * accept only compile-time properties, is an error;
+                         * otherwise add it to the list for run-time look up */
+                        if (ret_invlist) {
+                            RExC_parse = e + 1;
+                            vFAIL3("Property '%.*s' is unknown", (int) n, name);
+                        }
                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
                                         (value == 'p' ? '+' : '!'),
                                         name);
@@ -11889,6 +11896,18 @@ parseit:
                          * class */
                         const char *Xname = swash_property_names[classnum];
 
+                        /* If returning the inversion list, we can't defer
+                         * getting this until runtime */
+                        if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
+                            PL_utf8_swash_ptrs[classnum] =
+                                _core_swash_init("utf8", Xname, &PL_sv_undef,
+                                             1, /* binary */
+                                             0, /* not tr/// */
+                                             NULL, /* No inversion list */
+                                             NULL  /* No flags */
+                                            );
+                            assert(PL_utf8_swash_ptrs[classnum]);
+                        }
                         if ( !  PL_utf8_swash_ptrs[classnum]) {
                             if (namedclass % 2 == 0) { /* A non-complemented
                                                           class */
@@ -12350,7 +12369,7 @@ parseit:
     /* If the character class contains only a single element, it may be
      * optimizable into another node type which is smaller and runs faster.
      * Check if this is the case for this class */
-    if (element_count == 1) {
+    if (element_count == 1 && ! ret_invlist) {
         U8 op = END;
         U8 arg = 0;
 
@@ -12859,6 +12878,19 @@ parseit:
        invert = FALSE;
     }
 
+    if (ret_invlist) {
+        *ret_invlist = cp_list;
+
+        /* Discard the generated node */
+        if (SIZE_ONLY) {
+            RExC_size = orig_size;
+        }
+        else {
+            RExC_emit = orig_emit;
+        }
+        return END;
+    }
+
     /* If we didn't do folding, it's because some information isn't available
      * until runtime; set the run-time fold flag for these.  (We don't have to
      * worry about properties folding, as that is taken care of by the swash
index d86a870..8745044 100644 (file)
@@ -631,6 +631,7 @@ Useless (%sc) - %suse /gc modifier in regex; marked by <-- HERE in m/%s/
 Useless use of (?-p) in regex; marked by <-- HERE in m/%s/
 Unmatched '%c' in POSIX class in regex; marked by <-- HERE in m/%s/
 Unmatched '[' in POSIX class in regex; marked by <-- HERE in m/%s/
+Property '%s' is unknown in regex; marked by <-- HERE in m/%s/
 Need exactly 3 octal digits in regex; marked by <-- HERE in m/%s/
 Unrecognized escape \%c in character class in regex; marked by <-- HERE in m/%s/