Restructure grok_bslash_[ox]

author Karl Williamson <khw@cpan.org>

Thu, 16 Jan 2020 20:17:32 +0000 (13:17 -0700)

committer Karl Williamson <khw@cpan.org>

Thu, 23 Jan 2020 22:46:56 +0000 (15:46 -0700)
author Karl Williamson <khw@cpan.org>
Thu, 16 Jan 2020 20:17:32 +0000 (13:17 -0700)
committer Karl Williamson <khw@cpan.org>
Thu, 23 Jan 2020 22:46:56 +0000 (15:46 -0700)
diff --git a/dquote.c b/dquote.c

index d6e442e..4c688b6 100644 (file)
--- a/dquote.c
+++ b/dquote.c
@@ -79,51 +79,211 @@ Perl_grok_bslash_c(pTHX_ const char   source,
      return TRUE;
  }
  
+const char *
+Perl_form_alien_digit_msg(pTHX_
+        const U8 which,           /* 8 or 16 */
+        const STRLEN valids_len,  /* length of input before first bad char */
+        const char * const first_bad, /* Ptr to that bad char */
+        const char * const send,      /* End of input string */
+        const bool UTF,               /* Is it in UTF-8? */
+        const bool braced)            /* Is it enclosed in {} */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * alien characters found in an octal or hex constant given by the inputs,
+     * and return a pointer to that SV's string.  The message looks like:
+     *
+     * Non-hex character '?' terminates \x early.  Resolved as "\x{...}"
+     *
+     */
+
+    /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
+     * (leading zeros could take up more space, and the scalar will
+     * automatically grow if necessary).  Space for NUL is added by the newSV()
+     * function */
+    SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
+    SV * message_sv = sv_newmortal();
+    char symbol;
+
+    PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
+    assert(which == 8 || which == 16);
+
+    /* Calculate the display form of the character */
+    if (    UVCHR_IS_INVARIANT(*first_bad)
+        || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
+    {
+        pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
+                                                    (STRLEN) -1, UNI_DISPLAY_QQ);
+    }
+    else {  /* Is not UTF-8, or is illegal UTF-8.  Show just the one byte */
+
+        /* It also isn't a UTF-8 invariant character, so no display shortcuts
+         * are available.  Use \\x{...} */
+       Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
+    }
+
+    /* Ready to start building the message */
+    sv_setpvs(message_sv, "Non-");
+    if (which == 8) {
+        sv_catpvs(message_sv, "octal");
+        if (braced) {
+            symbol = 'o';
+        }
+        else {
+            symbol = '0';   /* \008, for example */
+        }
+    }
+    else {
+        sv_catpvs(message_sv, "hex");
+        symbol = 'x';
+    }
+    sv_catpvs(message_sv, " character ");
+
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    sv_catsv(message_sv, display_char);
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early.  Resolved as "
+                                     "\"\\%c", symbol, symbol);
+    if (braced) {
+        sv_catpvs(message_sv, "{");
+    }
+
+    /* Octal constants have an extra leading 0, but \0 already includes that */
+    if (symbol == 'o' && valids_len < 3) {
+        sv_catpvs(message_sv, "0");
+    }
+    if (valids_len == 0) {  /* No legal digits at all */
+        sv_catpvs(message_sv, "00");
+    }
+    else if (valids_len == 1) { /* Just one is legal */
+        sv_catpvs(message_sv, "0");
+    }
+    sv_catpvn(message_sv, first_bad - valids_len, valids_len);
+
+    if (braced) {
+        sv_catpvs(message_sv, "}");
+    }
+    else {
+        sv_catsv(message_sv, display_char);
+    }
+    sv_catpvs(message_sv, "\"");
+
+    SvREFCNT_dec_NN(display_char);
+
+    return SvPVX_const(message_sv);
+}
+
+const char *
+Perl_form_cp_too_large_msg(pTHX_
+        const U8 which,        /* 8 or 16 */
+        const char * string,   /* NULL, or the text that is supposed to
+                                  represent a code point */
+        const Size_t len,      /* length of 'string' if not NULL; else 0 */
+        const UV cp)           /* 0 if 'string' not NULL; else the too-large
+                                  code point */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * code points that are too large for this system, given by the inputs,
+     * and return a pointer to that SV's string.  Either the text of the string
+     * to be converted to a code point is input, or a code point itself.  The
+     * former is needed to accurately represent something that overflows.
+     *
+     * The message looks like:
+     *
+     * Use of code point %s is not allowed; the permissible max is %s
+     *
+     */
+
+    SV * message_sv = sv_newmortal();
+    const char * format;
+    const char * prefix;
+
+    PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
+    assert(which == 8 || which == 16);
+
+    /* One but not both must be non-zero */
+    assert((string != NULL) ^ (cp != 0));
+    assert((string == NULL) || len);
+
+    if (which == 8) {
+        format = "%" UVof;
+        prefix = "0";
+    }
+    else {
+        format = "%" UVXf;
+        prefix = "0x";
+    }
+
+    Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
+    if (string) {
+        Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
+    }
+    else {
+        Perl_sv_catpvf(aTHX_ message_sv, format, cp);
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
+    Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
+
+    return SvPVX_const(message_sv);
+}
+
  bool
  Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                        const char** message,
-                      const bool output_warning, const bool strict,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                        const bool UTF)
  {
  
  /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
- *  from; otherwise TRUE.  In either case the caller should look at *len [???].
- *  It guarantees that the returned codepoint, *uv, when expressed as
- *  utf8 bytes, would fit within the skipped "\o{...}" bytes.
- *  On input:
+ *  This returns FALSE if there is an error the caller should probably die
+ *  from; otherwise TRUE.
   *     s   is the address of a pointer to a string.  **s is 'o', and the
   *         previous character was a backslash.  At exit, *s will be advanced
   *         to the byte just after those absorbed by this function.  Hence the
- *         caller can continue parsing from there.  In the case of an error,
- *         this routine has generally positioned *s to point just to the right
- *         of the first bad spot, so that a message that has a "<--" to mark
- *         the spot will be correctly positioned.
+ *         caller can continue parsing from there.  In the case of an error
+ *         when this function returns FALSE, continuing to parse is not an
+ *         option, this routine has generally positioned *s to point just to
+ *         the right of the first bad spot, so that a message that has a "<--"
+ *         to mark the spot will be correctly positioned.
   *     send - 1  gives a limit in *s that this function is not permitted to
   *         look beyond.  That is, the function may look at bytes only in the
   *         range *s..send-1
   *     uv  points to a UV that will hold the output value, valid only if the
- *         return from the function is TRUE
- *      message is a pointer that will be set to an internal buffer giving an
- *         error message upon failure (the return is FALSE).  Untouched if
- *         function succeeds
- *     output_warning says whether to output any warning messages, or suppress
- *         them
+ *         return from the function is TRUE; may be changed from the input
+ *         value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
   *     strict is true if this should fail instead of warn if there are
   *         non-octal digits within the braces
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
   *     UTF is true iff the string *s is encoded in UTF-8.
   */
      char* e;
      STRLEN numbers_len;
      I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-               | PERL_SCAN_DISALLOW_PREFIX
-                | PERL_SCAN_SILENT_NON_PORTABLE
-               | PERL_SCAN_SILENT_ILLDIGIT;
+              | PERL_SCAN_DISALLOW_PREFIX
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_SILENT_OVERFLOW;
  
      PERL_ARGS_ASSERT_GROK_BSLASH_O;
  
      assert(*(*s - 1) == '\\');
      assert(* *s       == 'o');
+
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
      (*s)++;
  
      if (send <= *s || **s != '{') {
@@ -145,29 +305,40 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                    sequence if not a digit) */
      numbers_len = e - *s;
      if (numbers_len == 0) {
-        (*s)++;    /* Move past the } */
+        (*s)++;    /* Move past the '}' */
         *message = "Empty \\o{}";
         return FALSE;
      }
  
      *uv = grok_oct(*s, &numbers_len, &flags, NULL);
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
+
      /* Note that if has non-octal, will ignore everything starting with that up
       * to the '}' */
-
      if (numbers_len != (STRLEN) (e - *s)) {
+        *s += numbers_len;
          if (strict) {
-            *s += numbers_len;
              *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
              *message = "Non-octal character";
              return FALSE;
          }
-        else if (output_warning) {
-            Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
-            /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
-                        "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
-                        *(*s + numbers_len),
-                        (int) numbers_len,
-                        *s);
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
+                                                                      UTF, TRUE);
+            if (packed_warn) {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+            else {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
          }
      }
  
@@ -178,14 +349,16 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
  }
  
  bool
-Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
+Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
                        const char** message,
-                      const bool output_warning, const bool strict,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                        const bool UTF)
  {
  
  /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
+ *  This returns FALSE if there is an error the caller should probably die
   *  from; otherwise TRUE.
   *  It guarantees that the returned codepoint, *uv, when expressed as
   *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
@@ -202,29 +375,38 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
   *         look beyond.  That is, the function may look at bytes only in the
   *         range *s..send-1
   *     uv  points to a UV that will hold the output value, valid only if the
- *         return from the function is TRUE
- *      message is a pointer that will be set to an internal buffer giving an
- *         error message upon failure (the return is FALSE).  Untouched if
- *         function succeeds
- *     output_warning says whether to output any warning messages, or suppress
- *         them
+ *         return from the function is TRUE; may be changed from the input
+ *         value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
   *     strict is true if anything out of the ordinary should cause this to
   *         fail instead of warn or be silent.  For example, it requires
   *         exactly 2 digits following the \x (when there are no braces).
   *         3 digits could be a mistake, so is forbidden in this mode.
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
   *     UTF is true iff the string *s is encoded in UTF-8.
   */
      char* e;
      STRLEN numbers_len;
      I32 flags = PERL_SCAN_DISALLOW_PREFIX
-              | PERL_SCAN_SILENT_NON_PORTABLE;
-
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_NOTIFY_ILLDIGIT
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_OVERFLOW;
  
      PERL_ARGS_ASSERT_GROK_BSLASH_X;
  
      assert(*(*s - 1) == '\\');
      assert(* *s      == 'x');
  
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
      (*s)++;
  
      if (send <= *s) {
@@ -239,24 +421,34 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
          return TRUE;
      }
  
-    if (strict || ! output_warning) {
-        flags |= PERL_SCAN_SILENT_ILLDIGIT;
-    }
-
      if (**s != '{') {
-        STRLEN len = (strict) ? 3 : 2;
-
-       *uv = grok_hex(*s, &len, &flags, NULL);
-       *s += len;
-        if (strict && len != 2) {
-            if (len < 2) {
-                *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-                *message = "Non-hex character";
-            }
-            else {
+        numbers_len = (strict) ? 3 : 2;
+
+       *uv = grok_hex(*s, &numbers_len, &flags, NULL);
+       *s += numbers_len;
+
+        if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
+            if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
                  *message = "Use \\x{...} for more than two hex characters";
+                return FALSE;
+            }
+            else if (strict) {
+                    *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+                    *message = "Non-hex character";
+                    return FALSE;
+            }
+            else if (ckWARN(WARN_DIGIT)) {
+                const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                              send, UTF, FALSE);
+
+                if (! packed_warn) {
+                    Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+                }
+                else {
+                    *message = failure;
+                    *packed_warn = packWARN(WARN_DIGIT);
+                }
              }
-            return FALSE;
          }
         return TRUE;
      }
@@ -264,7 +456,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
      e = (char *) memchr(*s, '}', send - *s);
      if (!e) {
          (*s)++;  /* Move past the '{' */
-        while (isXDIGIT(**s)) { /* Position beyond the legal digits */
+        while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */
              (*s)++;
          }
          /* XXX The corresponding message above for \o is just '\\o{'; other
@@ -291,14 +483,33 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
      flags |= PERL_SCAN_ALLOW_UNDERSCORES;
  
      *uv = grok_hex(*s, &numbers_len, &flags, NULL);
-    /* Note that if has non-hex, will ignore everything starting with that up
-     * to the '}' */
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
  
-    if (strict && numbers_len != (STRLEN) (e - *s)) {
+    if (numbers_len != (STRLEN) (e - *s)) {
          *s += numbers_len;
-        *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-        *message = "Non-hex character";
-        return FALSE;
+        if (strict) {
+            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+            *message = "Non-hex character";
+            return FALSE;
+        }
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                                send, UTF, TRUE);
+            if (! packed_warn) {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
+            else {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+        }
      }
  
      /* Return past the '}' */
diff --git a/embed.fnc b/embed.fnc

index 012a479..db38c9e 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1136,28 +1136,42 @@ Ap      |void   |vload_module|U32 flags|NN SV* name|NULLOK SV* ver|NULLOK va_list* args
  p      |OP*    |localize       |NN OP *o|I32 lex
  ApdR   |I32    |looks_like_number|NN SV *const sv
  #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
-EpRX   |bool   |grok_bslash_x  |NN char** s             \
-                               |NN const char* const send       \
-                               |NN UV* uv                       \
-                               |NN const char** message         \
-                               |const bool output_warning       \
-                               |const bool strict               \
+EpRX   |bool   |grok_bslash_x  |NN char** s                    \
+                               |NN const char* const send      \
+                               |NN UV* uv                      \
+                               |NN const char** message        \
+                               |NULLOK U32 * packed_warn       \
+                               |const bool strict              \
+                               |const bool allow_UV_MAX        \
                                 |const bool utf8
  EpRX   |bool   |grok_bslash_c  |const char source              \
                                 |NN U8 * result                 \
                                 |NN const char** message        \
                                 |NULLOK U32 * packed_warn
-EpRX   |bool   |grok_bslash_o  |NN char** s             \
-                               |NN const char* const send       \
-                               |NN UV* uv                       \
-                               |NN const char** message         \
-                               |const bool output_warning       \
-                               |const bool strict               \
+EpRX   |bool   |grok_bslash_o  |NN char** s                    \
+                               |NN const char* const send      \
+                               |NN UV* uv                      \
+                               |NN const char** message        \
+                               |NULLOK U32 * packed_warn       \
+                               |const bool strict              \
+                               |const bool allow_UV_MAX        \
                                 |const bool utf8
+EpRX   |const char *|form_alien_digit_msg|const U8 which       \
+                               |const STRLEN valids_len        \
+                               |NN const char * const first_bad\
+                               |NN const char * const send     \
+                               |const bool UTF                 \
+                               |const bool braced
  EiR    |char*|form_short_octal_warning|NN const char * const s  \
                                 |const STRLEN len
  EiRT   |I32    |regcurly       |NN const char *s
  #endif
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
+EpRX   |const char *|form_cp_too_large_msg|const U8 which      \
+                               |NULLOK const char * string     \
+                               |const Size_t len               \
+                               |const UV cp
+#endif
  AMpd   |UV     |grok_hex       |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
  Apd    |int    |grok_infnan    |NN const char** sp|NN const char *send
  Apd    |int    |grok_number    |NN const char *pv|STRLEN len|NULLOK UV *valuep
diff --git a/embed.h b/embed.h

index cd167db..426ff1e 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -1108,10 +1108,12 @@
  #define invlist_clone(a,b)     Perl_invlist_clone(aTHX_ a,b)
  #  endif
  #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
+#define form_alien_digit_msg(a,b,c,d,e,f)      Perl_form_alien_digit_msg(aTHX_ a,b,c,d,e,f)
+#define form_cp_too_large_msg(a,b,c,d) Perl_form_cp_too_large_msg(aTHX_ a,b,c,d)
  #define form_short_octal_warning(a,b)  S_form_short_octal_warning(aTHX_ a,b)
  #define grok_bslash_c(a,b,c,d) Perl_grok_bslash_c(aTHX_ a,b,c,d)
-#define grok_bslash_o(a,b,c,d,e,f,g)   Perl_grok_bslash_o(aTHX_ a,b,c,d,e,f,g)
-#define grok_bslash_x(a,b,c,d,e,f,g)   Perl_grok_bslash_x(aTHX_ a,b,c,d,e,f,g)
+#define grok_bslash_o(a,b,c,d,e,f,g,h) Perl_grok_bslash_o(aTHX_ a,b,c,d,e,f,g,h)
+#define grok_bslash_x(a,b,c,d,e,f,g,h) Perl_grok_bslash_x(aTHX_ a,b,c,d,e,f,g,h)
  #define regcurly               S_regcurly
  #  endif
  #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 3379edf..54ddebe 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -218,6 +218,14 @@ it occured.
  
  =item *
  
+L<Non-hex character '%c' terminates \x early.  Resolved as "%s"|perldiag/"Non-hex character '%c' terminates \x early.  Resolved as "%s"">
+
+This replaces a warning that was much less specific, and which gave
+false information.  This new warning parallels the similar
+already-existing one raised for C<\o{}>.
+
+=item *
+
  L<message|perldiag/"message">
  
  =back
@@ -260,6 +268,15 @@ now has extra text added at the end, when raised during regular
  expression pattern compilation, marking where precisely in the pattern
  it occured.
  
+=item *
+
+L<Non-octal character '%c' terminates \o early.  Resolved as "%s"|perldiag/"Non-octal character '%c' terminates \o early.  Resolved as "%s"">
+
+now includes the phrase "terminates \o early", and has extra text added
+at the end, when raised during regular expression pattern compilation,
+marking where precisely in the pattern it occured.  In some instances
+the text of the resolution has been clarified.
+
  =back
  
  =head1 Utility Changes
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 3830003..11750fb 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4152,6 +4152,15 @@ a hex one was expected, like
   (?[ [ \xDG ] ])
   (?[ [ \x{DEKA} ] ])
  
+=item Non-hex character '%c' terminates \x early.  Resolved as "%s"
+
+(W digit) In parsing a hexadecimal numeric constant, a character was
+unexpectedly encountered that isn't hexadecimal.  The resulting value
+is as indicated.
+
+Note that, within braces, every character starting with the first
+non-hexadecimal up to the ending brace is ignored.
+
  =item Non-octal character in regex; marked by S<<-- HERE> in m/%s/
  
  (F) In a regular expression, there was a non-octal character where
@@ -4159,12 +4168,15 @@ an octal one was expected, like
  
   (?[ [ \o{1278} ] ])
  
-=item Non-octal character '%c'.  Resolved as "%s"
+=item Non-octal character '%c' terminates \o early.  Resolved as "%s"
  
  (W digit) In parsing an octal numeric constant, a character was
  unexpectedly encountered that isn't octal.  The resulting value
  is as indicated.
  
+Note that, within braces, every character starting with the first
+non-octal up to the ending brace is ignored.
+
  =item "no" not allowed in expression
  
  (F) The "no" keyword is recognized and executed at compile time, and
diff --git a/proto.h b/proto.h

index d5fb2ca..e1f8864 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5905,6 +5905,15 @@ PERL_CALLCONV SV*        Perl_invlist_clone(pTHX_ SV* const invlist, SV* newlist);
         assert(invlist)
  #endif
  #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
+PERL_CALLCONV const char *     Perl_form_alien_digit_msg(pTHX_ const U8 which, const STRLEN valids_len, const char * const first_bad, const char * const send, const bool UTF, const bool braced)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG  \
+       assert(first_bad); assert(send)
+
+PERL_CALLCONV const char *     Perl_form_cp_too_large_msg(pTHX_ const U8 which, const char * string, const Size_t len, const UV cp)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG
+
  #ifndef PERL_NO_INLINE_FUNCTIONS
  PERL_STATIC_INLINE char*       S_form_short_octal_warning(pTHX_ const char * const s, const STRLEN len)
                         __attribute__warn_unused_result__;
@@ -5917,12 +5926,12 @@ PERL_CALLCONV bool      Perl_grok_bslash_c(pTHX_ const char source, U8 * result, cons
  #define PERL_ARGS_ASSERT_GROK_BSLASH_C \
         assert(result); assert(message)
  
-PERL_CALLCONV bool     Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, const bool output_warning, const bool strict, const bool utf8)
+PERL_CALLCONV bool     Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT_GROK_BSLASH_O \
         assert(s); assert(send); assert(uv); assert(message)
  
-PERL_CALLCONV bool     Perl_grok_bslash_x(pTHX_ char** s, const char* const send, UV* uv, const char** message, const bool output_warning, const bool strict, const bool utf8)
+PERL_CALLCONV bool     Perl_grok_bslash_x(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT_GROK_BSLASH_X \
         assert(s); assert(send); assert(uv); assert(message)
diff --git a/regcomp.c b/regcomp.c

index 93bacdf..85d7555 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -14087,56 +14087,51 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         p++;
                         break;
                     case 'o':
-                       {
-                           UV result;
-                           const char* error_msg;
-
-                           bool valid = grok_bslash_o(&p,
-                                                       RExC_end,
-                                                      &result,
-                                                      &error_msg,
-                                                      TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       UTF);
-                           if (! valid) {
-                               RExC_parse = p; /* going to die anyway; point
-                                                  to exact spot of failure */
-                               vFAIL(error_msg);
-                           }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
-                           break;
-                       }
+                        if (! grok_bslash_o(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p; /* going to die anyway; point to
+                                               exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
+                        break;
                     case 'x':
-                       {
-                            UV result = UV_MAX; /* initialize to erroneous
-                                                   value */
-                           const char* error_msg;
-
-                           bool valid = grok_bslash_x(&p,
-                                                       RExC_end,
-                                                      &result,
-                                                      &error_msg,
-                                                       TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       UTF);
-                           if (! valid) {
-                               RExC_parse = p; /* going to die anyway; point
-                                                  to exact spot of failure */
-                               vFAIL(error_msg);
-                           }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
+                        if (! grok_bslash_x(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p;    /* going to die anyway; point
+                                                   to exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
  
  #ifdef EBCDIC
-                            if (ender < 0x100) {
-                                if (RExC_recode_x_to_native) {
-                                    ender = LATIN1_TO_NATIVE(ender);
-                                }
-                           }
+                        if (ender < 0x100) {
+                            if (RExC_recode_x_to_native) {
+                                ender = LATIN1_TO_NATIVE(ender);
+                            }
+                        }
  #endif
-                           break;
-                       }
+                        break;
                     case 'c':
                          p++;
                          if (! grok_bslash_c(*p, &grok_c_char,
@@ -16960,6 +16955,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
      PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
  
      if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+        CLEAR_POSIX_WARNINGS();
          return;
      }
  
@@ -17643,38 +17639,42 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             case 'a':   value = '\a';                   break;
             case 'o':
                 RExC_parse--;   /* function expects to be pointed at the 'o' */
-               {
-                   const char* error_msg;
-                   bool valid = grok_bslash_o(&RExC_parse,
-                                               RExC_end,
-                                              &value,
-                                              &error_msg,
-                                               TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               UTF);
-                   if (! valid) {
-                       vFAIL(error_msg);
-                   }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-               }
+                if (! grok_bslash_o(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            range, /* MAX_UV allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
                  non_portable_endpoint++;
                 break;
             case 'x':
                 RExC_parse--;   /* function expects to be pointed at the 'x' */
-               {
-                   const char* error_msg;
-                   bool valid = grok_bslash_x(&RExC_parse,
-                                               RExC_end,
-                                              &value,
-                                              &error_msg,
-                                              TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               UTF);
-                    if (! valid) {
-                       vFAIL(error_msg);
-                   }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-               }
+                if (!  grok_bslash_x(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            range, /* MAX_UV allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
                  non_portable_endpoint++;
                 break;
             case 'c':
@@ -17984,6 +17984,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
         /* non-Latin1 code point implies unicode semantics. */
         if (value > 255) {
+            if (value > MAX_LEGAL_CP && (   value != UV_MAX
+                                         || prevvalue > MAX_LEGAL_CP))
+            {
+                vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
+            }
              REQUIRE_UNI_RULES(flagp, 0);
              if (  ! silence_non_portable
                  &&  UNICODE_IS_PERL_EXTENDED(value)
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp

index b10680b..50b85fd 100644 (file)
--- a/t/lib/warnings/regcomp
+++ b/t/lib/warnings/regcomp
@@ -18,8 +18,8 @@ no warnings 'digit' ;
  my $a = qr/\o{1238456}\x{100}/;
  my $a = qr/[\o{6548321}]\x{100}/;
  EXPECT
-Non-octal character '8'.  Resolved as "\o{123}" at - line 3.
-Non-octal character '8'.  Resolved as "\o{654}" at - line 4.
+Non-octal character '8' terminates \o early.  Resolved as "\o{123}" in regex; marked by <-- HERE in m/\o{1238456} <-- HERE \x{100}/ at - line 3.
+Non-octal character '8' terminates \o early.  Resolved as "\o{654}" in regex; marked by <-- HERE in m/[\o{6548321} <-- HERE ]\x{100}/ at - line 4.
  ########
  # regcomp.c
  BEGIN {
diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke

index e875874..e22f51e 100644 (file)
--- a/t/lib/warnings/toke
+++ b/t/lib/warnings/toke
@@ -1378,7 +1378,7 @@ my $a = "\o{1238456}";
  no warnings 'digit' ;
  my $a = "\o{1238456}";
  EXPECT
-Non-octal character '8'.  Resolved as "\o{123}" at - line 3.
+Non-octal character '8' terminates \o early.  Resolved as "\o{123}" at - line 3.
  ########
  # toke.c
  use warnings;
@@ -1527,9 +1527,9 @@ print "aa" =~ m{^a{1,2}$}, "G\n";
  print "aq" =~ m[^a\[a-z\]$], "H\n";
  print "aq" =~ m(^a\(q\)$), "I\n";
  EXPECT
-Illegal hexadecimal digit '\' ignored at - line 5.
-Illegal hexadecimal digit '\' ignored at - line 7.
-Illegal hexadecimal digit '\' ignored at - line 9.
+Non-hex character '\\' terminates \x early.  Resolved as "\x00\\" in regex; marked by <-- HERE in m/^a\x <-- HERE \{61\}$/ at - line 5.
+Non-hex character '\\' terminates \x early.  Resolved as "\x00\\" in regex; marked by <-- HERE in m/^a\\\x <-- HERE \{6F\}$/ at - line 7.
+Non-hex character '\\' terminates \x early.  Resolved as "\x00\\" in regex; marked by <-- HERE in m/^a\\\\\x <-- HERE \{6F\}$/ at - line 9.
  A
  B
  1C
diff --git a/t/re/anyof.t b/t/re/anyof.t

index 9b9c030..6434f93 100644 (file)
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -485,7 +485,6 @@ my @tests = (
      '[\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
      '[\x{101}-{HIGHEST_CP}]' => 'ANYOFH[0101-HIGHEST_CP]',
      '[\x{102}\x{104}]' => 'ANYOFHb[0102 0104]',
-    '[\x{102}-\x{104}{INFTY}]' => 'ANYOFH[0102-0104 INFTY-INFTY]',
      '[\x{102}-\x{104}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 HIGHEST_CP]',
      '[\x{102}-\x{104}\x{101}]' => 'ANYOFRb[0101-0104]',
      '[\x{102}-\x{104}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
@@ -505,7 +504,6 @@ my @tests = (
      '[\x{102}-\x{104}\x{106}]' => 'ANYOFHb[0102-0104 0106]',
      '[\x{102}-\x{104}\x{106}-{INFTY}]' => 'ANYOFH[0102-0104 0106-INFTY]',
      '[\x{102}-\x{104}\x{106}-{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0106-HIGHEST_CP]',
-    '[\x{102}-\x{104}\x{108}-\x{10A}{INFTY}]' => 'ANYOFH[0102-0104 0108-010A INFTY-INFTY]',
      '[\x{102}-\x{104}\x{108}-\x{10A}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0108-010A HIGHEST_CP]',
      '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}]' => 'ANYOFHb[0101-0104 0108-010A]',
      '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
@@ -603,14 +601,12 @@ my @tests = (
      '[\x{106}-{INFTY}\x{107}-{INFTY}]' => 'ANYOFH[0106-INFTY]',
      '[\x{106}-{INFTY}\x{107}-{HIGHEST_CP}]' => 'ANYOFH[0106-INFTY]',
      '[\x{106}-{INFTY}\x{107}-\x{107}]' => 'ANYOFH[0106-INFTY]',
-    '[\x{10C}-{INFTY}{INFTY}]' => 'ANYOFH[010C-INFTY]',
      '[\x{10C}-{INFTY}{HIGHEST_CP}]' => 'ANYOFH[010C-INFTY]',
      '[\x{10C}-{INFTY}\x{00}-{HIGHEST_CP}]' => 'SANY',
      '[\x{10C}-{INFTY}\x{00}-{INFTY}]' => 'SANY',
      '[\x{10C}-{INFTY}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
      '[\x{10C}-{INFTY}\x{101}-{HIGHEST_CP}]' => 'ANYOFH[0101-INFTY]',
      '[\x{10C}-{INFTY}\x{102}\x{104}]' => 'ANYOFH[0102 0104 010C-INFTY]',
-    '[\x{10C}-{INFTY}\x{102}-\x{104}{INFTY}]' => 'ANYOFH[0102-0104 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{100}]' => 'ANYOFH[0100 0102-0104 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{101}]' => 'ANYOFH[0101-0104 010C-INFTY]',
@@ -631,7 +627,6 @@ my @tests = (
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{106}]' => 'ANYOFH[0102-0104 0106 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{106}-{INFTY}]' => 'ANYOFH[0102-0104 0106-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{106}-{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0106-INFTY]',
-    '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}{INFTY}]' => 'ANYOFH[0102-0104 0108-010A 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0108-010A 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}\x{101}]' => 'ANYOFH[0101-0104 0108-010A 010C-INFTY]',
      '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t

index c7d51d9..2793e9d 100644 (file)
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -332,25 +332,23 @@ my @death_only_under_strict = (
      'm/[\xABC]/' => "",
                   => 'Use \x{...} for more than two hex characters {#} m/[\xABC{#}]/',
  
-    # XXX This is a confusing error message.  The G isn't ignored; it just
-    # terminates the \x.  Also some messages below are missing the <-- HERE,
-    # aren't all category 'regexp'.  (Hence we have to turn off 'digit'
-    # messages as well below)
-    'm/\xAG/' => 'Illegal hexadecimal digit \'G\' ignored',
+    # some messages below aren't all category 'regexp'.  (Hence we have to
+    # turn off 'digit' messages as well below)
+    'm/\xAG/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x0AG" {#} m/\xA{#}G/',
                => 'Non-hex character {#} m/\xAG{#}/',
-    'm/[\xAG]/' => 'Illegal hexadecimal digit \'G\' ignored',
+    'm/[\xAG]/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x0AG" {#} m/[\xA{#}G]/',
                  => 'Non-hex character {#} m/[\xAG{#}]/',
-    'm/\o{789}/' => 'Non-octal character \'8\'.  Resolved as "\o{7}"',
+    'm/\o{789}/' => 'Non-octal character \'8\' terminates \o early.  Resolved as "\o{007}" {#} m/\o{789}{#}/',
                   => 'Non-octal character {#} m/\o{78{#}9}/',
-    'm/[\o{789}]/' => 'Non-octal character \'8\'.  Resolved as "\o{7}"',
+    'm/[\o{789}]/' => 'Non-octal character \'8\' terminates \o early.  Resolved as "\o{007}" {#} m/[\o{789}{#}]/',
                     => 'Non-octal character {#} m/[\o{78{#}9}]/',
      'm/\x{}/' => "",
                => 'Empty \x{} {#} m/\x{}{#}/',
      'm/[\x{}]/' => "",
                  => 'Empty \x{} {#} m/[\x{}{#}]/',
-    'm/\x{ABCDEFG}/' => 'Illegal hexadecimal digit \'G\' ignored',
+    'm/\x{ABCDEFG}/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x{ABCDEF}" {#} m/\x{ABCDEFG}{#}/',
                       => 'Non-hex character {#} m/\x{ABCDEFG{#}}/',
-    'm/[\x{ABCDEFG}]/' => 'Illegal hexadecimal digit \'G\' ignored',
+    'm/[\x{ABCDEFG}]/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x{ABCDEF}" {#} m/[\x{ABCDEFG}{#}]/',
                         => 'Non-hex character {#} m/[\x{ABCDEFG{#}}]/',
      "m'[\\y]\\x{100}'" => 'Unrecognized escape \y in character class passed through {#} m/[\y{#}]\x{100}/',
                         => 'Unrecognized escape \y in character class {#} m/[\y{#}]\x{100}/',
diff --git a/toke.c b/toke.c

index 41e6930..d54e79e 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3552,12 +3552,13 @@ S_scan_const(pTHX_ char *start)
                 {
                     const char* error;
  
-                   bool valid = grok_bslash_o(&s, send,
+                   if (! grok_bslash_o(&s, send,
                                                 &uv, &error,
-                                               TRUE, /* Output warning */
+                                               NULL,
                                                 FALSE, /* Not strict */
-                                               UTF);
-                   if (! valid) {
+                                               FALSE, /* No illegal cp's */
+                                               UTF))
+                    {
                         yyerror(error);
                         uv = 0; /* drop through to ensure range ends are set */
                     }
@@ -3569,12 +3570,13 @@ S_scan_const(pTHX_ char *start)
                 {
                     const char* error;
  
-                   bool valid = grok_bslash_x(&s, send,
+                   if (! grok_bslash_x(&s, send,
                                                 &uv, &error,
-                                               TRUE, /* Output warning */
+                                               NULL,
                                                 FALSE, /* Not strict */
-                                               UTF);
-                   if (! valid) {
+                                               FALSE, /* No illegal cp's */
+                                               UTF))
+                    {
                         yyerror(error);
                         uv = 0; /* drop through to ensure range ends are set */
                     }
author	Karl Williamson <khw@cpan.org>
	Thu, 16 Jan 2020 20:17:32 +0000 (13:17 -0700)
committer	Karl Williamson <khw@cpan.org>
	Thu, 23 Jan 2020 22:46:56 +0000 (15:46 -0700)
dquote.c		patch \| blob \| blame \| history
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
t/lib/warnings/regcomp		patch \| blob \| blame \| history
t/lib/warnings/toke		patch \| blob \| blame \| history
t/re/anyof.t		patch \| blob \| blame \| history
t/re/reg_mesg.t		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history