This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Split function into two functions
author Karl Williamson <khw@cpan.org>
Mon, 21 Sep 2015 03:25:13 +0000 (21:25 -0600)
committer Karl Williamson <khw@cpan.org>
Sun, 11 Oct 2015 16:48:31 +0000 (10:48 -0600)
Sometimes we want to move to the next non-ignored character in the
input.  The nextchar() function does that (but buggily in UTF-8).

And sometimes we are already at the next character, but if it is one
that should be ignored, we want to move to the first one that isn't.
This commit creates a function to do the second task by extracting the
code in nextchar() to it, and making nextchar() a lightweight wrapper
around it, and hence likely to be optimized out by the compiler.

This is a step in the direction of fixing the UTF-8 problems with
nextchar(), and fixing some other bugs.  The new function has added
generality which won't be used until a later commit.

embed.fnc
embed.h
proto.h
regcomp.c

diff --git a/embed.fnc b/embed.fnc
index 64837f7..162caf2 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2169,6 +2169,9 @@ Ei        |void   |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \
                                |UV code_point|bool downgradable
 Ein    |U8   |compute_EXACTish|NN RExC_state_t *pRExC_state
 Es     |void   |nextchar       |NN RExC_state_t *pRExC_state
+Es     |void   |skip_to_be_ignored_text|NN RExC_state_t *pRExC_state  \
+                               |NN char ** p                       \
+                               |const bool force_to_xmod
 Ein    |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p
 Es     |void   |scan_commit    |NN const RExC_state_t *pRExC_state \
                                |NN struct scan_data_t *data        \
diff --git a/embed.h b/embed.h
index a5a624b..5dc4be8 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define regtail(a,b,c,d)       S_regtail(aTHX_ a,b,c,d)
 #define scan_commit(a,b,c,d)   S_scan_commit(aTHX_ a,b,c,d)
 #define set_ANYOF_arg(a,b,c,d,e,f,g)   S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g)
+#define skip_to_be_ignored_text(a,b,c) S_skip_to_be_ignored_text(aTHX_ a,b,c)
 #define ssc_add_range(a,b,c)   S_ssc_add_range(aTHX_ a,b,c)
 #define ssc_and(a,b,c)         S_ssc_and(aTHX_ a,b,c)
 #define ssc_anything(a)                S_ssc_anything(aTHX_ a)
diff --git a/proto.h b/proto.h
index 57ed19b..79ad937 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4803,6 +4803,9 @@ STATIC void       S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_dat
 STATIC void    S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, SV* const only_utf8_locale_list, SV* const swash, const bool has_user_defined_property);
 #define PERL_ARGS_ASSERT_SET_ANYOF_ARG \
        assert(pRExC_state); assert(node)
+STATIC void    S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, char ** p, const bool force_to_xmod);
+#define PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT       \
+       assert(pRExC_state); assert(p)
 PERL_STATIC_INLINE void        S_ssc_add_range(pTHX_ regnode_ssc *ssc, UV const start, UV const end);
 #define PERL_ARGS_ASSERT_SSC_ADD_RANGE \
        assert(ssc)
diff --git a/regcomp.c b/regcomp.c
index 855c26b..91d91bc 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -16405,6 +16405,51 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
     return p;
 }
 
+STATIC void
+S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
+                                char ** p,
+                                const bool force_to_xmod
+                         )
+{
+    /* If the text at the current parse position '*p' is a '(?#...)' comment,
+     * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
+     * is /x whitespace, advance '*p' so that on exit it points to the first
+     * byte past all such white space and comments */
+    /* '*p' is updated in place; nothing is returned.  /x handling applies if the caller forces it or the flag is set */
+    const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
+
+    PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
+    /* Loop, because comments and white space may follow one another repeatedly */
+    for (;;) {
+       if (RExC_end - (*p) >= 3   /* At least 3 bytes left, enough for "(?#" */
+           && *(*p)     == '('
+           && *(*p + 1) == '?'
+           && *(*p + 2) == '#')
+       {   /* At a "(?#" comment: scan forward to the first ')' */
+           while (*(*p) != ')') {   /* NOTE(review): '**p' is read before the end check below; presumably the byte at RExC_end is readable -- confirm */
+               if ((*p) == RExC_end)   /* Ran off the end without finding ')' */
+                   FAIL("Sequence (?#... not terminated");
+               (*p)++;
+           }
+           (*p)++;     /* Step over the closing ')' */
+           continue;   /* More ignorable text may follow */
+       }
+       /* Otherwise, when /x handling applies, let regpatws() try to advance */
+       if (use_xmod) {
+            char * new_p = regpatws(pRExC_state, *p,
+                                    TRUE); /* means recognize comments */
+            if (new_p != *p) {   /* regpatws() moved forward: accept and retry */
+                *p = new_p;
+                continue;
+            }
+       }
+        /* Neither check advanced '*p' on this pass, so we are done */
+        break;
+    }
+
+    return;
+}
+
 /* nextchar()
 
    Advances the parse position by one byte, unless that byte is the beginning
@@ -16422,30 +16467,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 
     RExC_parse++;
 
-    for (;;) {
-       if (RExC_end - RExC_parse >= 3
-           && *RExC_parse == '('
-           && RExC_parse[1] == '?'
-           && RExC_parse[2] == '#')
-       {
-           while (*RExC_parse != ')') {
-               if (RExC_parse == RExC_end)
-                   FAIL("Sequence (?#... not terminated");
-               RExC_parse++;
-           }
-           RExC_parse++;
-           continue;
-       }
-       if (RExC_flags & RXf_PMf_EXTENDED) {
-            char * p = regpatws(pRExC_state, RExC_parse,
-                                          TRUE); /* means recognize comments */
-            if (p != RExC_parse) {
-                RExC_parse = p;
-                continue;
-            }
-       }
-        break;
-    }
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                            FALSE /* Don't assume /x */ );
 }
 
 STATIC regnode *