This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: refactor a static function
authorKarl Williamson <khw@cpan.org>
Sun, 20 Sep 2015 03:13:54 +0000 (21:13 -0600)
committerKarl Williamson <khw@cpan.org>
Sun, 11 Oct 2015 16:48:31 +0000 (10:48 -0600)
nextchar() advances the parse to the next byte beyond any ignorable
bytes, returning the parse pointer before the advancement.

I find this confusing, as

    foo = nextchar();

reads as if foo should point to the next character, instead of the
character where the parse already is at.  This functionality is hard for
a reader to grok, even if the name weren't misleading, as the place the
variable gets set in the source is far away from the call.  It's clearer
to say

    foo = current;
    nextchar();

This has confused others as well, as in one place several commits have
been required to get it so it works properly, and games have been played
to back up the parse if it turns out it shouldn't have been advanced,
whereas it's better to check first, then advance if it is the right
thing to do.  Ready-Fire-Aim is not a best practice.

This commit makes nextchar() return void, and changes the few places
where the en-passant value was used.

The new scheme is still buggy, as nextchar() only advances a single
byte, which may be the wrong thing to do when the pattern is UTF-8
encoded.  More work is needed to be in a position to fix this.  We have
only gotten away with this so far because apparently no one is using
non-ASCII white space under /x, and our meta characters are all ASCII,
and there are likely other things that reposition things to a character
boundary before problems have arisen.

embed.fnc
proto.h
regcomp.c

index 6cc7e2b..64837f7 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2168,7 +2168,7 @@ Ei        |void   |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \
                                |NN regnode *node|NN I32 *flagp|STRLEN len \
                                |UV code_point|bool downgradable
 Ein    |U8   |compute_EXACTish|NN RExC_state_t *pRExC_state
-Es     |char * |nextchar       |NN RExC_state_t *pRExC_state
+Es     |void   |nextchar       |NN RExC_state_t *pRExC_state
 Ein    |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p
 Es     |void   |scan_commit    |NN const RExC_state_t *pRExC_state \
                                |NN struct scan_data_t *data        \
diff --git a/proto.h b/proto.h
index d8d29e3..57ed19b 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4730,7 +4730,7 @@ STATIC U32        S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_
 STATIC I32     S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth);
 #define PERL_ARGS_ASSERT_MAKE_TRIE     \
        assert(pRExC_state); assert(startbranch); assert(first); assert(last); assert(tail)
-STATIC char *  S_nextchar(pTHX_ RExC_state_t *pRExC_state);
+STATIC void    S_nextchar(pTHX_ RExC_state_t *pRExC_state);
 #define PERL_ARGS_ASSERT_NEXTCHAR      \
        assert(pRExC_state)
 STATIC void    S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state);
index 5233499..855c26b 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10402,7 +10402,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                     /* (?(1)...) */
                    char c;
-                   char *tmp;
                     UV uv;
                     if (grok_atoUV(RExC_parse, &uv, &endptr)
                         && uv <= I32_MAX
@@ -10416,14 +10415,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     ret = reganode(pRExC_state, GROUPP, parno);
 
                  insert_if_check_paren:
-                   if (*(tmp = nextchar(pRExC_state)) != ')') {
-                        /* nextchar also skips comments, so undo its work
-                         * and skip over the the next character.
-                         */
-                        RExC_parse = tmp;
+                   if (UCHARAT(RExC_parse) != ')') {
                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
                        vFAIL("Switch condition not recognized");
                    }
+                   nextchar(pRExC_state);
                  insert_if:
                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
                     br = regbranch(pRExC_state, &flags, 1,depth+1);
@@ -10437,7 +10433,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     } else
                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
                                                           LONGJMP, 0));
-                   c = *nextchar(pRExC_state);
+                   c = UCHARAT(RExC_parse);
+                    nextchar(pRExC_state);
                    if (flags&HASWIDTH)
                        *flagp |= HASWIDTH;
                    if (c == '|') {
@@ -10458,7 +10455,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         REGTAIL(pRExC_state, ret, lastbr);
                        if (flags&HASWIDTH)
                            *flagp |= HASWIDTH;
-                       c = *nextchar(pRExC_state);
+                        c = UCHARAT(RExC_parse);
+                        nextchar(pRExC_state);
                    }
                    else
                        lastbr = NULL;
@@ -10731,10 +10729,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
         }
-       if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+       if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched (");
        }
+       nextchar(pRExC_state);
     }
     else if (!paren && RExC_parse < RExC_end) {
        if (*RExC_parse == ')') {
@@ -16416,13 +16415,13 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
    This is the (?#...) and /x friendly way of saying RExC_parse++.
 */
 
-STATIC char*
+STATIC void
 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 {
-    char* const retval = RExC_parse++;
-
     PERL_ARGS_ASSERT_NEXTCHAR;
 
+    RExC_parse++;
+
     for (;;) {
        if (RExC_end - RExC_parse >= 3
            && *RExC_parse == '('
@@ -16445,7 +16444,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
                 continue;
             }
        }
-        return retval;
+        break;
     }
 }