regcomp.c: Use a parameter to simplify some code
author: Karl Williamson <public@khwilliamson.com>
Mon, 31 Dec 2012 18:54:44 +0000 (11:54 -0700)
committer: Karl Williamson <public@khwilliamson.com>
Fri, 11 Jan 2013 18:50:35 +0000 (11:50 -0700)
When parsing \p{} outside of a bracketed character class, code in
regcomp.c has pretended it is a bracketed character class by temporarily
changing the parsing pointers, calling the charclass handler, and then
restoring the pointers afterwards.  This code can be simplified by
instead passing a flag to the handler meaning to just parse one item.
The faking is simpler inside the handler, with no restoring necessary.
Also we can eliminate the duplicate handling of special cases.

Future commits will make more extensive use of this mechanism.

embed.fnc
embed.h
proto.h
regcomp.c

diff --git a/embed.fnc b/embed.fnc
index 463e087..0bdbd93 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1950,7 +1950,7 @@ Es        |regnode*|regbranch     |NN struct RExC_state_t *pRExC_state \
 Es     |STRLEN |reguni         |NN const struct RExC_state_t *pRExC_state \
                                |UV uv|NN char *s
 Es     |regnode*|regclass      |NN struct RExC_state_t *pRExC_state \
-                               |NN I32 *flagp|U32 depth
+                               |NN I32 *flagp|U32 depth|const bool stop_at_1
 Es     |regnode*|reg_node      |NN struct RExC_state_t *pRExC_state|U8 op
 Es     |UV     |reg_recode     |const char value|NN SV **encp
 Es     |regnode*|regpiece      |NN struct RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index ac543e5..c0439a6 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define reganode(a,b,c)                S_reganode(aTHX_ a,b,c)
 #define regatom(a,b,c)         S_regatom(aTHX_ a,b,c)
 #define regbranch(a,b,c,d)     S_regbranch(aTHX_ a,b,c,d)
-#define regclass(a,b,c)                S_regclass(aTHX_ a,b,c)
+#define regclass(a,b,c,d)      S_regclass(aTHX_ a,b,c,d)
 #define reginsert(a,b,c,d)     S_reginsert(aTHX_ a,b,c,d)
 #define regpiece(a,b,c)                S_regpiece(aTHX_ a,b,c)
 #define regpposixcc(a,b,c)     S_regpposixcc(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index d85e9af..5f16d65 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6639,7 +6639,7 @@ STATIC regnode*   S_regbranch(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp,
 #define PERL_ARGS_ASSERT_REGBRANCH     \
        assert(pRExC_state); assert(flagp)
 
-STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
+STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_REGCLASS      \
diff --git a/regcomp.c b/regcomp.c
index 496fb8f..7382753 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10099,7 +10099,8 @@ tryagain:
     case '[':
     {
        char * const oregcomp_parse = ++RExC_parse;
-        ret = regclass(pRExC_state, flagp,depth+1);
+        ret = regclass(pRExC_state, flagp,depth+1,
+                       FALSE /* means parse the whole char class */ );
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched [");
@@ -10287,32 +10288,15 @@ tryagain:
        case 'p':
        case 'P':
            {
-               char* const oldregxend = RExC_end;
 #ifdef DEBUGGING
                char* parse_start = RExC_parse - 2;
 #endif
 
-               if (RExC_parse[1] == '{') {
-                 /* a lovely hack--pretend we saw [\pX] instead */
-                   RExC_end = strchr(RExC_parse, '}');
-                   if (!RExC_end) {
-                       const U8 c = (U8)*RExC_parse;
-                       RExC_parse += 2;
-                       RExC_end = oldregxend;
-                       vFAIL2("Missing right brace on \\%c{}", c);
-                   }
-                   RExC_end++;
-               }
-               else {
-                   RExC_end = RExC_parse + 2;
-                   if (RExC_end > oldregxend)
-                       RExC_end = oldregxend;
-               }
                RExC_parse--;
 
-                ret = regclass(pRExC_state, flagp,depth+1);
+                ret = regclass(pRExC_state, flagp,depth+1,
+                               TRUE /* means just parse this element */ );
 
-               RExC_end = oldregxend;
                RExC_parse--;
 
                Set_Node_Offset(ret, parse_start + 2);
@@ -11239,7 +11223,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
 
 STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1)
 {
     /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
      * but something like [a] will produce an EXACT node; [aA], an EXACTFish
@@ -11283,6 +11267,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     AV * multi_char_matches = NULL; /* Code points that fold to more than one
                                        character; used under /i */
     UV n;
+    char * stop_ptr = RExC_end;    /* where to stop parsing */
 
     /* Unicode properties are stored in a swash; this holds the current one
      * being parsed.  If this swash is the only above-latin1 component of the
@@ -11375,12 +11360,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
        }
     }
 
+    /* If the caller wants us to just parse a single element, accomplish this
+     * by faking the loop ending condition */
+    if (stop_at_1 && RExC_end > RExC_parse) {
+        stop_ptr = RExC_parse + 1;
+    }
+
     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
     if (UCHARAT(RExC_parse) == ']')
        goto charclassloop;
 
 parseit:
-    while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
+    while (RExC_parse < stop_ptr && UCHARAT(RExC_parse) != ']') {
 
     charclassloop: