msgrcv: properly downgrade the receive buffer

[perl5.git] / dquote.c
diff --git a/dquote.c b/dquote.c

index 10fb2b5..dcbd8c9 100644 (file)
--- a/dquote.c
+++ b/dquote.c
@@ -8,106 +8,285 @@
  #include "EXTERN.h"
  #define PERL_IN_DQUOTE_C
  #include "perl.h"
-#include "dquote_inline.h"
  
  /* XXX Add documentation after final interface and behavior is decided */
-/* May want to show context for error, so would pass S_grok_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning)
-    U8 source = *current;
-*/
  
-char
-Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
+bool
+Perl_grok_bslash_c(pTHX_ const char   source,
+                         U8 *         result,
+                         const char** message,
+                         U32 *        packed_warn)
  {
-
-    U8 result;
+    PERL_ARGS_ASSERT_GROK_BSLASH_C;
+
+    /* This returns TRUE if the \c? sequence is valid; FALSE otherwise.  If it
+     * is valid, the sequence evaluates to a single character, which will be
+     * stored into *result.
+     *
+     * source   is the character immediately after a '\c' sequence.
+     * result   points to a char variable into which this function will store
+     *          what the sequence evaluates to, if valid; unchanged otherwise.
+     * message  A pointer to any warning or error message will be stored into
+     *          this pointer; NULL if none.
+     * packed_warn if NULL on input asks that this routine display any warning
+     *          messages.  Otherwise, if the function found a warning, the
+     *          packed warning categories will be stored into *packed_warn (and
+     *          the corresponding message text into *message); 0 if none.
+     */
+
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
  
      if (! isPRINT_A(source)) {
-        Perl_croak(aTHX_ "%s",
-                        "Character following \"\\c\" must be printable ASCII");
+        *message = "Character following \"\\c\" must be printable ASCII";
+        return FALSE;
      }
-    else if (source == '{') {
+
+    if (source == '{') {
          const char control = toCTRL('{');
          if (isPRINT_A(control)) {
              /* diag_listed_as: Use "%s" instead of "%s" */
-            Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
+            *message = Perl_form(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
          }
          else {
-            Perl_croak(aTHX_ "Sequence \"\\c{\" invalid");
+            *message = "Sequence \"\\c{\" invalid";
          }
+        return FALSE;
      }
  
-    result = toCTRL(source);
-    if (output_warning && isPRINT_A(result)) {
+    *result = toCTRL(source);
+    if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
          U8 clearer[3];
          U8 i = 0;
-        if (! isWORDCHAR(result)) {
+        char format[] = "\"\\c%c\" is more clearly written simply as \"%s\"";
+
+        if (! isWORDCHAR(*result)) {
              clearer[i++] = '\\';
          }
-        clearer[i++] = result;
+        clearer[i++] = *result;
          clearer[i++] = '\0';
  
-        Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                        "\"\\c%c\" is more clearly written simply as \"%s\"",
-                        source,
-                        clearer);
+        if (packed_warn) {
+            *message = Perl_form(aTHX_ format, source, clearer);
+            *packed_warn = packWARN(WARN_SYNTAX);
+        }
+        else {
+            Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
+        }
+    }
+
+    return TRUE;
+}
+
+const char *
+Perl_form_alien_digit_msg(pTHX_
+        const U8 which,           /* 8 or 16 */
+        const STRLEN valids_len,  /* length of input before first bad char */
+        const char * const first_bad, /* Ptr to that bad char */
+        const char * const send,      /* End of input string */
+        const bool UTF,               /* Is it in UTF-8? */
+        const bool braced)            /* Is it enclosed in {} */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * alien characters found in an octal or hex constant given by the inputs,
+     * and return a pointer to that SV's string.  The message looks like:
+     *
+     * Non-hex character '?' terminates \x early.  Resolved as "\x{...}"
+     *
+     */
+
+    /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
+     * (leading zeros could take up more space, and the scalar will
+     * automatically grow if necessary).  Space for NUL is added by the newSV()
+     * function */
+    SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
+    SV * message_sv = sv_newmortal();
+    char symbol;
+
+    PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
+    assert(which == 8 || which == 16);
+
+    /* Calculate the display form of the character */
+    if (    UVCHR_IS_INVARIANT(*first_bad)
+        || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
+    {
+        pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
+                                                    (STRLEN) -1, UNI_DISPLAY_QQ);
+    }
+    else {  /* Is not UTF-8, or is illegal UTF-8.  Show just the one byte */
+
+        /* It also isn't a UTF-8 invariant character, so no display shortcuts
+         * are available.  Use \\x{...} */
+       Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
+    }
+
+    /* Ready to start building the message */
+    sv_setpvs(message_sv, "Non-");
+    if (which == 8) {
+        sv_catpvs(message_sv, "octal");
+        if (braced) {
+            symbol = 'o';
+        }
+        else {
+            symbol = '0';   /* \008, for example */
+        }
+    }
+    else {
+        sv_catpvs(message_sv, "hex");
+        symbol = 'x';
+    }
+    sv_catpvs(message_sv, " character ");
+
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    sv_catsv(message_sv, display_char);
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early.  Resolved as "
+                                     "\"\\%c", symbol, symbol);
+    if (braced) {
+        sv_catpvs(message_sv, "{");
+    }
+
+    /* Octal constants have an extra leading 0, but \0 already includes that */
+    if (symbol == 'o' && valids_len < 3) {
+        sv_catpvs(message_sv, "0");
+    }
+    if (valids_len == 0) {  /* No legal digits at all */
+        sv_catpvs(message_sv, "00");
+    }
+    else if (valids_len == 1) { /* Just one is legal */
+        sv_catpvs(message_sv, "0");
+    }
+    sv_catpvn(message_sv, first_bad - valids_len, valids_len);
+
+    if (braced) {
+        sv_catpvs(message_sv, "}");
+    }
+    else {
+        sv_catsv(message_sv, display_char);
+    }
+    sv_catpvs(message_sv, "\"");
+
+    SvREFCNT_dec_NN(display_char);
+
+    return SvPVX_const(message_sv);
+}
+
+const char *
+Perl_form_cp_too_large_msg(pTHX_
+        const U8 which,        /* 8 or 16 */
+        const char * string,   /* NULL, or the text that is supposed to
+                                  represent a code point */
+        const Size_t len,      /* length of 'string' if not NULL; else 0 */
+        const UV cp)           /* 0 if 'string' not NULL; else the too-large
+                                  code point */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * code points that are too large for this system, given by the inputs,
+     * and return a pointer to that SV's string.  Either the text of the string
+     * to be converted to a code point is input, or a code point itself.  The
+     * former is needed to accurately represent something that overflows.
+     *
+     * The message looks like:
+     *
+     * Use of code point %s is not allowed; the permissible max is %s
+     *
+     */
+
+    SV * message_sv = sv_newmortal();
+    const char * format;
+    const char * prefix;
+
+    PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
+    assert(which == 8 || which == 16);
+
+    /* One but not both must be non-zero */
+    assert((string != NULL) ^ (cp != 0));
+    assert((string == NULL) || len);
+
+    if (which == 8) {
+        format = "%" UVof;
+        prefix = "0";
+    }
+    else {
+        format = "%" UVXf;
+        prefix = "0x";
+    }
+
+    Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
+    if (string) {
+        Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
+    }
+    else {
+        Perl_sv_catpvf(aTHX_ message_sv, format, cp);
      }
+    Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
+    Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
  
-    return result;
+    return SvPVX_const(message_sv);
  }
  
  bool
  Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
-                      const char** error_msg,
-                      const bool output_warning, const bool strict,
-                      const bool silence_non_portable,
+                      const char** message,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                        const bool UTF)
  {
  
  /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
- *  from; otherwise TRUE.  In either case the caller should look at *len [???].
- *  It guarantees that the returned codepoint, *uv, when expressed as
- *  utf8 bytes, would fit within the skipped "\o{...}" bytes.
- *  On input:
+ *  This returns FALSE if there is an error the caller should probably die
+ *  from; otherwise TRUE.
   *     s   is the address of a pointer to a string.  **s is 'o', and the
   *         previous character was a backslash.  At exit, *s will be advanced
   *         to the byte just after those absorbed by this function.  Hence the
- *         caller can continue parsing from there.  In the case of an error,
- *         this routine has generally positioned *s to point just to the right
- *         of the first bad spot, so that a message that has a "<--" to mark
- *         the spot will be correctly positioned.
+ *         caller can continue parsing from there.  In the case of an error
+ *         when this function returns FALSE, continuing to parse is not an
+ *         option, this routine has generally positioned *s to point just to
+ *         the right of the first bad spot, so that a message that has a "<--"
+ *         to mark the spot will be correctly positioned.
   *     send - 1  gives a limit in *s that this function is not permitted to
   *         look beyond.  That is, the function may look at bytes only in the
   *         range *s..send-1
   *     uv  points to a UV that will hold the output value, valid only if the
- *         return from the function is TRUE
- *      error_msg is a pointer that will be set to an internal buffer giving an
- *         error message upon failure (the return is FALSE).  Untouched if
- *         function succeeds
- *     output_warning says whether to output any warning messages, or suppress
- *         them
+ *         return from the function is TRUE; may be changed from the input
+ *         value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
   *     strict is true if this should fail instead of warn if there are
   *         non-octal digits within the braces
- *      silence_non_portable is true if to suppress warnings about the code
- *          point returned being too large to fit on all platforms.
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
   *     UTF is true iff the string *s is encoded in UTF-8.
   */
      char* e;
      STRLEN numbers_len;
      I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-               | PERL_SCAN_DISALLOW_PREFIX
-               /* XXX Until the message is improved in grok_oct, handle errors
-                * ourselves */
-               | PERL_SCAN_SILENT_ILLDIGIT;
+              | PERL_SCAN_DISALLOW_PREFIX
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_SILENT_OVERFLOW;
  
      PERL_ARGS_ASSERT_GROK_BSLASH_O;
  
      assert(*(*s - 1) == '\\');
      assert(* *s       == 'o');
+
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
      (*s)++;
  
-    if (**s != '{') {
-       *error_msg = "Missing braces on \\o{}";
+    if (send <= *s || **s != '{') {
+       *message = "Missing braces on \\o{}";
         return FALSE;
      }
  
@@ -117,7 +296,7 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
          while (isOCTAL(**s)) { /* Position beyond the legal digits */
              (*s)++;
          }
-        *error_msg = "Missing right brace on \\o{";
+        *message = "Missing right brace on \\o{}";
         return FALSE;
      }
  
@@ -125,33 +304,40 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                    sequence if not a digit) */
      numbers_len = e - *s;
      if (numbers_len == 0) {
-        (*s)++;    /* Move past the } */
-       *error_msg = "Number with no digits";
+        (*s)++;    /* Move past the '}' */
+       *message = "Empty \\o{}";
         return FALSE;
      }
  
-    if (silence_non_portable) {
-        flags |= PERL_SCAN_SILENT_NON_PORTABLE;
+    *uv = grok_oct(*s, &numbers_len, &flags, NULL);
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
      }
  
-    *uv = grok_oct(*s, &numbers_len, &flags, NULL);
      /* Note that if has non-octal, will ignore everything starting with that up
       * to the '}' */
-
      if (numbers_len != (STRLEN) (e - *s)) {
+        *s += numbers_len;
          if (strict) {
-            *s += numbers_len;
              *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-            *error_msg = "Non-octal character";
+            *message = "Non-octal character";
              return FALSE;
          }
-        else if (output_warning) {
-            Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
-            /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
-                        "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
-                        *(*s + numbers_len),
-                        (int) numbers_len,
-                        *s);
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
+                                                                      UTF, TRUE);
+            if (packed_warn) {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+            else {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
          }
      }
  
@@ -162,15 +348,16 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
  }
  
  bool
-Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
-                      const char** error_msg,
-                      const bool output_warning, const bool strict,
-                      const bool silence_non_portable,
+Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
+                      const char** message,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                        const bool UTF)
  {
  
  /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
+ *  This returns FALSE if there is an error the caller should probably die
   *  from; otherwise TRUE.
   *  It guarantees that the returned codepoint, *uv, when expressed as
   *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
@@ -187,49 +374,80 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
   *         look beyond.  That is, the function may look at bytes only in the
   *         range *s..send-1
   *     uv  points to a UV that will hold the output value, valid only if the
- *         return from the function is TRUE
- *      error_msg is a pointer that will be set to an internal buffer giving an
- *         error message upon failure (the return is FALSE).  Untouched if
- *         function succeeds
- *     output_warning says whether to output any warning messages, or suppress
- *         them
+ *         return from the function is TRUE; may be changed from the input
+ *         value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
   *     strict is true if anything out of the ordinary should cause this to
   *         fail instead of warn or be silent.  For example, it requires
   *         exactly 2 digits following the \x (when there are no braces).
   *         3 digits could be a mistake, so is forbidden in this mode.
- *      silence_non_portable is true if to suppress warnings about the code
- *          point returned being too large to fit on all platforms.
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
   *     UTF is true iff the string *s is encoded in UTF-8.
   */
      char* e;
      STRLEN numbers_len;
-    I32 flags = PERL_SCAN_DISALLOW_PREFIX;
-
+    I32 flags = PERL_SCAN_DISALLOW_PREFIX
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_NOTIFY_ILLDIGIT
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_OVERFLOW;
  
      PERL_ARGS_ASSERT_GROK_BSLASH_X;
  
      assert(*(*s - 1) == '\\');
      assert(* *s      == 'x');
+
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
      (*s)++;
  
-    if (strict || ! output_warning) {
-        flags |= PERL_SCAN_SILENT_ILLDIGIT;
+    if (send <= *s) {
+        if (strict) {
+            *message = "Empty \\x";
+            return FALSE;
+        }
+
+        /* Sadly, to preserve backcompat, an empty \x at the end of string is
+         * interpreted as a NUL */
+        *uv = 0;
+        return TRUE;
      }
  
      if (**s != '{') {
-        STRLEN len = (strict) ? 3 : 2;
-
-       *uv = grok_hex(*s, &len, &flags, NULL);
-       *s += len;
-        if (strict && len != 2) {
-            if (len < 2) {
-                *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-                *error_msg = "Non-hex character";
+        numbers_len = (strict) ? 3 : 2;
+
+       *uv = grok_hex(*s, &numbers_len, &flags, NULL);
+       *s += numbers_len;
+
+        if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
+            if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
+                *message = "Use \\x{...} for more than two hex characters";
+                return FALSE;
              }
-            else {
-                *error_msg = "Use \\x{...} for more than two hex characters";
+            else if (strict) {
+                    *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+                    *message = "Non-hex character";
+                    return FALSE;
+            }
+            else if (ckWARN(WARN_DIGIT)) {
+                const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                              send, UTF, FALSE);
+
+                if (! packed_warn) {
+                    Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+                }
+                else {
+                    *message = failure;
+                    *packed_warn = packWARN(WARN_DIGIT);
+                }
              }
-            return FALSE;
          }
         return TRUE;
      }
@@ -237,13 +455,10 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
      e = (char *) memchr(*s, '}', send - *s);
      if (!e) {
          (*s)++;  /* Move past the '{' */
-        while (isXDIGIT(**s)) { /* Position beyond the legal digits */
+        while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */
              (*s)++;
          }
-        /* XXX The corresponding message above for \o is just '\\o{'; other
-         * messages for other constructs include the '}', so are inconsistent.
-         */
-       *error_msg = "Missing right brace on \\x{}";
+       *message = "Missing right brace on \\x{}";
         return FALSE;
      }
  
@@ -253,7 +468,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
      if (numbers_len == 0) {
          if (strict) {
              (*s)++;    /* Move past the } */
-            *error_msg = "Number with no digits";
+            *message = "Empty \\x{}";
              return FALSE;
          }
          *s = e + 1;
@@ -262,19 +477,35 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
      }
  
      flags |= PERL_SCAN_ALLOW_UNDERSCORES;
-    if (silence_non_portable) {
-        flags |= PERL_SCAN_SILENT_NON_PORTABLE;
-    }
  
      *uv = grok_hex(*s, &numbers_len, &flags, NULL);
-    /* Note that if has non-hex, will ignore everything starting with that up
-     * to the '}' */
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
  
-    if (strict && numbers_len != (STRLEN) (e - *s)) {
+    if (numbers_len != (STRLEN) (e - *s)) {
          *s += numbers_len;
-        *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-        *error_msg = "Non-hex character";
-        return FALSE;
+        if (strict) {
+            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+            *message = "Non-hex character";
+            return FALSE;
+        }
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                                send, UTF, TRUE);
+            if (! packed_warn) {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
+            else {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+        }
      }
  
      /* Return past the '}' */