regcomp.c: Add capability for stricter parsing of []
author Karl Williamson <public@khwilliamson.com>
Thu, 10 Jan 2013 23:06:53 +0000 (16:06 -0700)
committer Karl Williamson <public@khwilliamson.com>
Fri, 11 Jan 2013 18:50:38 +0000 (11:50 -0700)
This adds the capability, currently unused, of forbidding certain things
in [bracketed] character classes.  Included are things that warn but
still compile, such as false ranges (e.g. [\d-\w]) and unrecognized escapes.

Also forbidden are potentially ambiguous cases where \x (without braces)
isn't followed by exactly two hex digits, or \000 where the number of
octal digits isn't precisely three.

regcomp.c
t/porting/diag.t

index 06d7950..14e92df 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11335,6 +11335,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                        character; used under /i */
     UV n;
     char * stop_ptr = RExC_end;    /* where to stop parsing */
+    const bool strict = FALSE;
 
     /* Unicode properties are stored in a swash; this holds the current one
      * being parsed.  If this swash is the only above-latin1 component of the
@@ -11461,7 +11462,7 @@ parseit:
             && RExC_parse < RExC_end
             && POSIXCC(UCHARAT(RExC_parse)))
         {
-            namedclass = regpposixcc(pRExC_state, value, listsv, FALSE);
+            namedclass = regpposixcc(pRExC_state, value, listsv, strict);
         }
         else if (value == '\\') {
            if (UTF) {
@@ -11645,7 +11646,7 @@ parseit:
                                               &error_msg,
                                                SIZE_ONLY,   /* warnings in pass
                                                                1 only */
-                                               FALSE, /* Not strict */
+                                               strict,
                                                silence_non_portable,
                                                UTF);
                    if (! valid) {
@@ -11664,7 +11665,7 @@ parseit:
                                               &value,
                                               &error_msg,
                                               TRUE, /* Output warnings */
-                                               FALSE, /* Not strict */
+                                               strict,
                                                silence_non_portable,
                                                UTF);
                     if (! valid) {
@@ -11682,9 +11683,15 @@ parseit:
                {
                    /* Take 1-3 octal digits */
                    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
-                   numlen = 3;
+                    numlen = (strict) ? 4 : 3;
                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
+                    if (strict) {
+                        if (numlen != 3) {
+                            RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                            vFAIL("Need exactly 3 octal digits");
+                        }
+                    }
                    if (PL_encoding && value < 0x100)
                        goto recode_encoding;
                    break;
@@ -11693,19 +11700,31 @@ parseit:
                if (! RExC_override_recoding) {
                    SV* enc = PL_encoding;
                    value = reg_recode((const char)(U8)value, &enc);
-                   if (!enc && SIZE_ONLY)
-                       ckWARNreg(RExC_parse,
+                   if (!enc) {
+                        if (strict) {
+                            vFAIL("Invalid escape in the specified encoding");
+                        }
+                        else if (SIZE_ONLY) {
+                            ckWARNreg(RExC_parse,
                                  "Invalid escape in the specified encoding");
+                        }
+                    }
                    break;
                }
            default:
                /* Allow \_ to not give an error */
                if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
                    SAVEFREESV(listsv);
+                    if (strict) {
+                        vFAIL2("Unrecognized escape \\%c in character class",
+                               (int)value);
+                    }
+                    else {
                    SAVEFREESV(RExC_rx_sv);
                    ckWARN2reg(RExC_parse,
                               "Unrecognized escape \\%c in character class passed through",
                               (int)value);
+                    }
                    (void)ReREFCNT_inc(RExC_rx_sv);
                    SvREFCNT_inc_simple_void_NN(listsv);
                }
@@ -11754,6 +11773,10 @@ parseit:
                        RExC_parse >= rangebegin ?
                        RExC_parse - rangebegin : 0;
                    SAVEFREESV(listsv); /* in case of fatal warnings */
+                    if (strict) {
+                        vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
+                    }
+                    else {
                    SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
                    ckWARN4reg(RExC_parse,
                               "False [] range \"%*.*s\"",
@@ -11762,6 +11785,7 @@ parseit:
                    SvREFCNT_inc_simple_void_NN(listsv);
                     cp_list = add_cp_to_invlist(cp_list, '-');
                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
+                    }
                }
 
                range = 0; /* this was not a true range */
@@ -12048,13 +12072,19 @@ parseit:
 
                /* a bad range like \w-, [:word:]- ? */
                if (namedclass > OOB_NAMEDCLASS) {
-                   if (ckWARN(WARN_REGEXP)) {
+                   if (strict || ckWARN(WARN_REGEXP)) {
                        const int w =
                            RExC_parse >= rangebegin ?
                            RExC_parse - rangebegin : 0;
+                        if (strict) {
+                            vFAIL4("False [] range \"%*.*s\"",
+                                   w, w, rangebegin);
+                        }
+                        else {
                        vWARN4(RExC_parse,
                               "False [] range \"%*.*s\"",
                               w, w, rangebegin);
+                        }
                    }
                     if (!SIZE_ONLY) {
                         cp_list = add_cp_to_invlist(cp_list, '-');
index 8657e97..d86a870 100644 (file)
@@ -631,6 +631,8 @@ Useless (%sc) - %suse /gc modifier in regex; marked by <-- HERE in m/%s/
 Useless use of (?-p) in regex; marked by <-- HERE in m/%s/
 Unmatched '%c' in POSIX class in regex; marked by <-- HERE in m/%s/
 Unmatched '[' in POSIX class in regex; marked by <-- HERE in m/%s/
+Need exactly 3 octal digits in regex; marked by <-- HERE in m/%s/
+Unrecognized escape \%c in character class in regex; marked by <-- HERE in m/%s/
 
 __CATEGORIES__
 Code point 0x%X is not Unicode, all \p{} matches fail; all \P{} matches succeed
@@ -645,3 +647,4 @@ UTF-16 surrogate U+%X
 Non-octal character in regex; marked by <-- HERE in m/%s/
 Non-hex character in regex; marked by <-- HERE in m/%s/
 Use \\x{...} for more than two hex characters in regex; marked by <-- HERE in m/%s/
+False [] range "%s" in regex; marked by <-- HERE in m/%s/