Extend strictness for qr/(?[ \N{} ])/
authorKarl Williamson <public@khwilliamson.com>
Sat, 19 Jan 2013 04:58:10 +0000 (21:58 -0700)
committerKarl Williamson <public@khwilliamson.com>
Sat, 19 Jan 2013 16:57:53 +0000 (09:57 -0700)
This recently added regex syntax imposes stricter rules on parsing than
normal.   However, this did not include parsing \N{} constructs that
occur within it.  This commit does that, making fatal the warnings that
come from \N{}

I will add to perldiag the newly added messages along with the others
for (?[ ]) before 5.18 ships

embed.fnc
embed.h
proto.h
regcomp.c
t/porting/diag.t
t/re/pat_advanced.t
t/re/reg_mesg.t

index 1fd1f4e..53c582d 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1983,9 +1983,10 @@ Es       |regnode*|reg_node      |NN struct RExC_state_t *pRExC_state|U8 op
 Es     |UV     |reg_recode     |const char value|NN SV **encp
 Es     |regnode*|regpiece      |NN struct RExC_state_t *pRExC_state \
                                |NN I32 *flagp|U32 depth
-Es     |bool   |grok_bslash_N  |NN struct RExC_state_t *pRExC_state \
-                               |NULLOK regnode** nodep|NULLOK UV *valuep \
-                               |NN I32 *flagp|U32 depth|bool in_char_class
+Es     |bool   |grok_bslash_N  |NN struct RExC_state_t *pRExC_state        \
+                               |NULLOK regnode** nodep|NULLOK UV *valuep   \
+                               |NN I32 *flagp|U32 depth|bool in_char_class \
+                               |const bool strict
 Es     |void   |reginsert      |NN struct RExC_state_t *pRExC_state \
                                |U8 op|NN regnode *opnd|U32 depth
 Es     |void   |regtail        |NN struct RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index 470805d..1df6ab4 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define get_invlist_previous_index_addr(a)     S_get_invlist_previous_index_addr(aTHX_ a)
 #define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a)
 #define get_invlist_zero_addr(a)       S_get_invlist_zero_addr(aTHX_ a)
-#define grok_bslash_N(a,b,c,d,e,f)     S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
+#define grok_bslash_N(a,b,c,d,e,f,g)   S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
 #define handle_sets(a,b,c,d)   S_handle_sets(aTHX_ a,b,c,d)
 #define invlist_array(a)       S_invlist_array(aTHX_ a)
 #define invlist_clone(a)       S_invlist_clone(aTHX_ a)
diff --git a/proto.h b/proto.h
index e0c3279..0d0078d 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6503,7 +6503,7 @@ PERL_STATIC_INLINE UV*    S_get_invlist_zero_addr(pTHX_ SV* invlist)
 #define PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR \
        assert(invlist)
 
-STATIC bool    S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+STATIC bool    S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, const bool strict)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_4);
 #define PERL_ARGS_ASSERT_GROK_BSLASH_N \
index 05e9fe5..a22f8ff 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -9634,7 +9634,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 }
 
 STATIC bool
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
+        const bool strict   /* Apply stricter parsing rules? */
+    )
 {
    
  /* This is expected to be called by a parser routine that has recognized '\N'
@@ -9749,9 +9751,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
        }
         else if (in_char_class) {
             if (SIZE_ONLY && in_char_class) {
-                ckWARNreg(RExC_parse,
-                        "Ignoring zero length \\N{} in character class"
-                );
+                if (strict) {
+                    RExC_parse++;   /* Position after the "}" */
+                    vFAIL("Zero length \\N{}");
+                }
+                else {
+                    ckWARNreg(RExC_parse,
+                              "Ignoring zero length \\N{} in character class");
+                }
             }
             ret = FALSE;
        }
@@ -9803,7 +9810,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
        }
 
         if (in_char_class && has_multiple_chars) {
-           ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+            if (strict) {
+                RExC_parse = endbrace;
+                vFAIL("\\N{} in character class restricted to one character");
+            }
+            else {
+                ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+            }
         }
 
         RExC_parse = endbrace + 1;
@@ -10339,7 +10352,8 @@ tryagain:
              * special treatment for quantifiers is not needed for such single
              * character sequences */
             ++RExC_parse;
-            if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+            if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
+                                FALSE /* not strict */ )) {
                 RExC_parse--;
                 goto defchar;
             }
@@ -10603,7 +10617,8 @@ tryagain:
                          * */
                         RExC_parse = p + 1;
                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
-                                            flagp, depth, FALSE))
+                                            flagp, depth, FALSE,
+                                            FALSE /* not strict */ ))
                         {
                             RExC_parse = p = oldp;
                             goto loopdone;
@@ -11988,7 +12003,8 @@ parseit:
                     from earlier versions, OTOH that behaviour was broken
                     as well. */
                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
-                                      TRUE /* => charclass */))
+                                      TRUE, /* => charclass */
+                                      strict))
                     {
                         goto parseit;
                     }
index 13f1811..7355151 100644 (file)
@@ -658,3 +658,5 @@ Operation "%s" returns its argument for UTF-16 surrogate U+%X
 Unicode surrogate U+%X is illegal in UTF-8
 UTF-16 surrogate U+%X
 False [] range "%s" in regex; marked by <-- HERE in m/%s/
+\N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/
+Zero length \N{} in regex; marked by <-- HERE in m/%s/
index a411220..60ae9d6 100644 (file)
@@ -1014,6 +1014,9 @@ sub run_tests {
         ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works';
         ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works';
 
+        eval '/(?[[\N{EMPTY-STR}]])/';
+        ok $@ && $@ =~ /Zero length \\N\{}/;
+
         undef $w;
         eval q [is("\N{TOO  MANY SPACES}", "TOO  MANY SPACES", "Multiple spaces in character name works")];
         like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but returns a deprecation warning");
index 14e9ace..30bc2d6 100644 (file)
@@ -147,7 +147,9 @@ my @death =
  'm/(?[[\w-x]])/' => 'False [] range "\w-" in regex; marked by {#} in m/(?[[\w-{#}x]])/',
  'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" in regex; marked by {#} in m/(?[[a-\pM{#}]])/',
  'm/(?[[\pM-x]])/' => 'False [] range "\pM-" in regex; marked by {#} in m/(?[[\pM-{#}x]])/',
+ 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character in regex; marked by {#} in m/(?[[\N{U+100.300{#}}]])/',
 );
+# Tests involving a user-defined charnames translator are in pat_advanced.t
 
 ##
 ## Key-value pairs of code/error of code that should have non-fatal warnings.