Add optional strict mode to grok_bslash_[xo]
authorKarl Williamson <public@khwilliamson.com>
Mon, 7 Jan 2013 05:28:33 +0000 (22:28 -0700)
committerKarl Williamson <public@khwilliamson.com>
Fri, 11 Jan 2013 18:50:35 +0000 (11:50 -0700)
This mode croaks on any iffy constructs that currently compile.  It is
not currently used; documentation of the error messages will be
delivered later.

dquote_static.c
embed.fnc
embed.h
proto.h
regcomp.c
t/porting/diag.t
toke.c

index d928e67..1ab4ebd 100644 (file)
@@ -86,7 +86,8 @@ S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warn
 
 STATIC bool
 S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg,
-                      const bool output_warning)
+                      const bool output_warning, const bool strict,
+                      const bool UTF)
 {
 
 /*  Documentation to be supplied when interface nailed down finally
@@ -107,6 +108,9 @@ S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg,
  *         function succeeds
  *     output_warning says whether to output any warning messages, or suppress
  *         them
+ *     strict is true if this should fail instead of warn if there are
+ *         non-octal digits within the braces
+ *     UTF is true iff the string *s is encoded in UTF-8.
  */
     char* e;
     STRLEN numbers_len;
@@ -150,13 +154,21 @@ S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg,
     /* Note that if has non-octal, will ignore everything starting with that up
      * to the '}' */
 
-    if (output_warning && numbers_len != (STRLEN) (e - *s)) {
+    if (numbers_len != (STRLEN) (e - *s)) {
+        if (strict) {
+            *s += numbers_len;
+            *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1;
+            *error_msg = "Non-octal character";
+            return FALSE;
+        }
+        else if (output_warning) {
        Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
        /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
                       "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
                       *(*s + numbers_len),
                       (int) numbers_len,
                       *s);
+        }
     }
 
     /* Return past the '}' */
@@ -167,7 +179,8 @@ S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg,
 
 PERL_STATIC_INLINE bool
 S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg,
-                      const bool output_warning)
+                      const bool output_warning, const bool strict,
+                      const bool UTF)
 {
 
 /*  Documentation to be supplied when interface nailed down finally
@@ -188,11 +201,15 @@ S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg,
  *         function succeeds
  *     output_warning says whether to output any warning messages, or suppress
  *         them
+ *     strict is true if anything out of the ordinary should cause this to
+ *         fail instead of warn or be silent.  For example, it requires
+ *         exactly 2 digits following the \x (when there are no braces).
+ *         3 digits could be a mistake, so is forbidden in this mode.
+ *     UTF is true iff the string *s is encoded in UTF-8.
  */
     char* e;
     STRLEN numbers_len;
-    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-               | PERL_SCAN_DISALLOW_PREFIX;
+    I32 flags = 0;
 
     PERL_ARGS_ASSERT_GROK_BSLASH_X;
 
@@ -201,11 +218,26 @@ S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg,
     assert(**s == 'x');
     (*s)++;
 
+    if (strict) {
+        flags |= PERL_SCAN_SILENT_ILLDIGIT;
+    }
+
     if (**s != '{') {
-       I32 flags = PERL_SCAN_DISALLOW_PREFIX;
-       STRLEN len = 2;
+        STRLEN len = (strict) ? 3 : 2;
+
+       flags |= PERL_SCAN_DISALLOW_PREFIX;
        *uv = grok_hex(*s, &len, &flags, NULL);
        *s += len;
+        if (strict && len != 2) {
+            if (len < 2) {
+                *s += (UTF) ? UTF8SKIP(*s) : 1;
+                *error_msg = "Non-hex character";
+            }
+            else {
+                *error_msg = "Use \\x{...} for more than two hex characters";
+            }
+            return FALSE;
+        }
        return TRUE;
     }
 
@@ -225,10 +257,28 @@ S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg,
     (*s)++;    /* Point to expected first digit (could be first byte of utf8
                   sequence if not a digit) */
     numbers_len = e - *s;
+    if (numbers_len == 0) {
+        if (strict) {
+            (*s)++;    /* Move past the } */
+            *error_msg = "Number with no digits";
+            return FALSE;
+        }
+        return TRUE;
+    }
+
+    flags |= PERL_SCAN_ALLOW_UNDERSCORES|PERL_SCAN_DISALLOW_PREFIX;
+
     *uv = grok_hex(*s, &numbers_len, &flags, NULL);
     /* Note that if has non-hex, will ignore everything starting with that up
      * to the '}' */
 
+    if (strict && numbers_len != (STRLEN) (e - *s)) {
+        *s += numbers_len;
+        *s += (UTF) ? UTF8SKIP(*s) : 1;
+        *error_msg = "Non-hex character";
+        return FALSE;
+    }
+
     /* Return past the '}' */
     *s = e + 1;
 
index 02ce9b2..bd8c2cf 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -738,10 +738,12 @@ Apd       |UV     |grok_bin       |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV
 EMsR   |char   |grok_bslash_c  |const char source|const bool utf8|const bool output_warning
 EMsR   |bool   |grok_bslash_o  |NN char** s|NN UV* uv \
                                |NN const char** error_msg   \
-                               |const bool output_warning
+                               |const bool output_warning   \
+                               |const bool strict|const bool utf8
 EMiR   |bool   |grok_bslash_x  |NN char** s|NN UV* uv \
                                |NN const char** error_msg   \
-                               |const bool output_warning
+                               |const bool output_warning   \
+                               |const bool strict|const bool utf8
 #endif
 Apd    |UV     |grok_hex       |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
 Apd    |int    |grok_number    |NN const char *pv|STRLEN len|NULLOK UV *valuep
diff --git a/embed.h b/embed.h
index 86d7760..125c6a7 100644 (file)
--- a/embed.h
+++ b/embed.h
 #  endif
 #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
 #define grok_bslash_c(a,b,c)   S_grok_bslash_c(aTHX_ a,b,c)
-#define grok_bslash_o(a,b,c,d) S_grok_bslash_o(aTHX_ a,b,c,d)
-#define grok_bslash_x(a,b,c,d) S_grok_bslash_x(aTHX_ a,b,c,d)
+#define grok_bslash_o(a,b,c,d,e,f)     S_grok_bslash_o(aTHX_ a,b,c,d,e,f)
+#define grok_bslash_x(a,b,c,d,e,f)     S_grok_bslash_x(aTHX_ a,b,c,d,e,f)
 #define regcurly(a)            S_regcurly(aTHX_ a)
 #  endif
 #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C)
diff --git a/proto.h b/proto.h
index 596f310..6123569 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6756,7 +6756,7 @@ PERL_CALLCONV SV* Perl__core_swash_init(pTHX_ const char* pkg, const char* name,
 STATIC char    S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning)
                        __attribute__warn_unused_result__;
 
-STATIC bool    S_grok_bslash_o(pTHX_ char** s, UV* uv, const char** error_msg, const bool output_warning)
+STATIC bool    S_grok_bslash_o(pTHX_ char** s, UV* uv, const char** error_msg, const bool output_warning, const bool strict, const bool utf8)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2)
@@ -6764,7 +6764,7 @@ STATIC bool       S_grok_bslash_o(pTHX_ char** s, UV* uv, const char** error_msg, cons
 #define PERL_ARGS_ASSERT_GROK_BSLASH_O \
        assert(s); assert(uv); assert(error_msg)
 
-PERL_STATIC_INLINE bool        S_grok_bslash_x(pTHX_ char** s, UV* uv, const char** error_msg, const bool output_warning)
+PERL_STATIC_INLINE bool        S_grok_bslash_x(pTHX_ char** s, UV* uv, const char** error_msg, const bool output_warning, const bool strict, const bool utf8)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2)
index c6bd79a..74de460 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10618,7 +10618,9 @@ tryagain:
                            bool valid = grok_bslash_o(&p,
                                                       &result,
                                                       &error_msg,
-                                                      TRUE); /* out warnings */
+                                                      TRUE, /* out warnings */
+                                                       FALSE, /* not strict */
+                                                       UTF);
                            if (! valid) {
                                RExC_parse = p; /* going to die anyway; point
                                                   to exact spot of failure */
@@ -10644,7 +10646,9 @@ tryagain:
                            bool valid = grok_bslash_x(&p,
                                                       &result,
                                                       &error_msg,
-                                                      TRUE); /* out warnings */
+                                                      TRUE, /* out warnings */
+                                                       FALSE, /* not strict */
+                                                       UTF);
                            if (! valid) {
                                RExC_parse = p; /* going to die anyway; point
                                                   to exact spot of failure */
@@ -11568,7 +11572,9 @@ parseit:
                    bool valid = grok_bslash_o(&RExC_parse,
                                               &value,
                                               &error_msg,
-                                              SIZE_ONLY);
+                                              SIZE_ONLY,
+                                               FALSE, /* Not strict */
+                                               UTF);
                    if (! valid) {
                        vFAIL(error_msg);
                    }
@@ -11584,7 +11590,9 @@ parseit:
                    bool valid = grok_bslash_x(&RExC_parse,
                                               &value,
                                               &error_msg,
-                                              1);
+                                              TRUE, /* Output warnings */
+                                               FALSE, /* Not strict */
+                                               UTF);
                    if (! valid) {
                        vFAIL(error_msg);
                    }
index 9d84ebb..3901f2a 100644 (file)
@@ -640,3 +640,6 @@ Operation "%s" returns its argument for non-Unicode code point 0x%X
 Operation "%s" returns its argument for UTF-16 surrogate U+%X
 Unicode surrogate U+%X is illegal in UTF-8
 UTF-16 surrogate U+%X
+Non-octal character in regex; marked by <-- HERE in m/%s/
+Non-hex character in regex; marked by <-- HERE in m/%s/
+Use \\x{...} for more than two hex characters in regex; marked by <-- HERE in m/%s/
diff --git a/toke.c b/toke.c
index 6cf5afc..dbadf60 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3290,7 +3290,9 @@ S_scan_const(pTHX_ char *start)
                    const char* error;
 
                    bool valid = grok_bslash_o(&s, &uv, &error,
-                                               TRUE); /* Output warning */
+                                               TRUE, /* Output warning */
+                                               FALSE, /* Not strict */
+                                               UTF);
                    if (! valid) {
                        yyerror(error);
                        continue;
@@ -3304,7 +3306,9 @@ S_scan_const(pTHX_ char *start)
                    const char* error;
 
                    bool valid = grok_bslash_x(&s, &uv, &error,
-                                               TRUE); /* Output warning */
+                                               TRUE, /* Output warning */
+                                               FALSE, /* Not strict */
+                                               UTF);
                    if (! valid) {
                        yyerror(error);
                        continue;