X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/7f4ec488653048a5623702c419020a9402453c6a..HEAD:/dquote.c diff --git a/dquote.c b/dquote.c index 19a7a37..e2c03f6 100644 --- a/dquote.c +++ b/dquote.c @@ -8,269 +8,555 @@ #include "EXTERN.h" #define PERL_IN_DQUOTE_C #include "perl.h" -#include "dquote_inline.h" /* XXX Add documentation after final interface and behavior is decided */ -/* May want to show context for error, so would pass S_grok_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning) - U8 source = *current; -*/ -char -Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) +bool +Perl_grok_bslash_c(pTHX_ const char source, + U8 * result, + const char** message, + U32 * packed_warn) { - - U8 result; + PERL_ARGS_ASSERT_GROK_BSLASH_C; + + /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it + * is valid, the sequence evaluates to a single character, which will be + * stored into *result. + * + * source is the character immediately after a '\c' sequence. + * result points to a char variable into which this function will store + * what the sequence evaluates to, if valid; unchanged otherwise. + * message A pointer to any warning or error message will be stored into + * this pointer; NULL if none. + * packed_warn if NULL on input asks that this routine display any warning + * messages. Otherwise, if the function found a warning, the + * packed warning categories will be stored into *packed_warn (and + * the corresponding message text into *message); 0 if none. + */ + + *message = NULL; + if (packed_warn) *packed_warn = 0; if (! isPRINT_A(source)) { - Perl_croak(aTHX_ "%s", - "Character following \"\\c\" must be printable ASCII"); + *message = "Character following \"\\c\" must be printable ASCII"; + return FALSE; } - else if (source == '{') { + + if (source == '{') { const char control = toCTRL('{'); if (isPRINT_A(control)) { /* diag_listed_as: Use "%s" instead of "%s" */ - Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", control); + *message = Perl_form(aTHX_ PERL_DIAG_DIE_SYNTAX("Use \"%c\" instead of \"\\c{\""), control); } else { - Perl_croak(aTHX_ "Sequence \"\\c{\" invalid"); + *message = "Sequence \"\\c{\" invalid"; } + return FALSE; } - result = toCTRL(source); - if (output_warning && isPRINT_A(result)) { + *result = toCTRL(source); + if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) { U8 clearer[3]; U8 i = 0; - if (! isWORDCHAR(result)) { + char format[] = PERL_DIAG_WARN_SYNTAX("\"\\c%c\" is more clearly written simply as \"%s\""); + + if (! isWORDCHAR(*result)) { clearer[i++] = '\\'; } - clearer[i++] = result; + clearer[i++] = *result; clearer[i++] = '\0'; - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); + if (packed_warn) { + *message = Perl_form(aTHX_ format, source, clearer); + *packed_warn = packWARN(WARN_SYNTAX); + } + else { + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer); + } } - return result; + return TRUE; +} + +const char * +Perl_form_alien_digit_msg(pTHX_ + const U8 which, /* 8 or 16 */ + const STRLEN valids_len, /* length of input before first bad char */ + const char * const first_bad, /* Ptr to that bad char */ + const char * const send, /* End of input string */ + const bool UTF, /* Is it in UTF-8? */ + const bool braced) /* Is it enclosed in {} */ +{ + /* Generate a mortal SV containing an appropriate warning message about + * alien characters found in an octal or hex constant given by the inputs, + * and return a pointer to that SV's string. The message looks like: + * + * Non-hex character '?' terminates \x early. Resolved as "\x{...}" + * + */ + + /* The usual worst case scenario: 2 chars to display per byte, plus \x{} + * (leading zeros could take up more space, and the scalar will + * automatically grow if necessary). Space for NUL is added by the newSV() + * function */ + SV * display_char = newSV(2 * UTF8_MAXBYTES + 4); + SV * message_sv = sv_newmortal(); + char symbol; + + PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG; + assert(which == 8 || which == 16); + + /* Calculate the display form of the character */ + if ( UVCHR_IS_INVARIANT(*first_bad) + || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send))) + { + pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad), + (STRLEN) -1, UNI_DISPLAY_QQ); + } + else { /* Is not UTF-8, or is illegal UTF-8. Show just the one byte */ + + /* It also isn't a UTF-8 invariant character, so no display shortcuts + * are available. Use \\x{...} */ + Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad); + } + + /* Ready to start building the message */ + sv_setpvs(message_sv, "Non-"); + if (which == 8) { + sv_catpvs(message_sv, "octal"); + if (braced) { + symbol = 'o'; + } + else { + symbol = '0'; /* \008, for example */ + } + } + else { + sv_catpvs(message_sv, "hex"); + symbol = 'x'; + } + sv_catpvs(message_sv, " character "); + + if (isPRINT(*first_bad)) { + sv_catpvs(message_sv, "'"); + } + sv_catsv(message_sv, display_char); + if (isPRINT(*first_bad)) { + sv_catpvs(message_sv, "'"); + } + Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early. Resolved as " + "\"\\%c", symbol, symbol); + if (braced) { + sv_catpvs(message_sv, "{"); + } + + /* Octal constants have an extra leading 0, but \0 already includes that */ + if (symbol == 'o' && valids_len < 3) { + sv_catpvs(message_sv, "0"); + } + if (valids_len == 0) { /* No legal digits at all */ + sv_catpvs(message_sv, "00"); + } + else if (valids_len == 1) { /* Just one is legal */ + sv_catpvs(message_sv, "0"); + } + sv_catpvn(message_sv, first_bad - valids_len, valids_len); + + if (braced) { + sv_catpvs(message_sv, "}"); + } + else { + sv_catsv(message_sv, display_char); + } + sv_catpvs(message_sv, "\""); + + SvREFCNT_dec_NN(display_char); + + return SvPVX_const(message_sv); +} + +const char * +Perl_form_cp_too_large_msg(pTHX_ + const U8 which, /* 8 or 16 */ + const char * string, /* NULL, or the text that is supposed to + represent a code point */ + const Size_t len, /* length of 'string' if not NULL; else 0 */ + const UV cp) /* 0 if 'string' not NULL; else the too-large + code point */ +{ + /* Generate a mortal SV containing an appropriate warning message about + * code points that are too large for this system, given by the inputs, + * and return a pointer to that SV's string. Either the text of the string + * to be converted to a code point is input, or a code point itself. The + * former is needed to accurately represent something that overflows. + * + * The message looks like: + * + * Use of code point %s is not allowed; the permissible max is %s + * + */ + + SV * message_sv = sv_newmortal(); + const char * format; + const char * prefix; + + PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG; + assert(which == 8 || which == 16); + + /* One but not both must be non-zero */ + assert((string != NULL) ^ (cp != 0)); + assert((string == NULL) || len); + + if (which == 8) { + format = "%" UVof; + prefix = "0"; + } + else { + format = "%" UVXf; + prefix = "0x"; + } + + Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix); + if (string) { + Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string); + } + else { + Perl_sv_catpvf(aTHX_ message_sv, format, cp); + } + Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix); + Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP); + + return SvPVX_const(message_sv); } bool -Perl_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg, - const bool output_warning, const bool strict, - const bool silence_non_portable, +Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv, + const char** message, + U32 * packed_warn, + const bool strict, + const bool allow_UV_MAX, const bool UTF) { /* Documentation to be supplied when interface nailed down finally - * This returns FALSE if there is an error which the caller need not recover - * from; otherwise TRUE. In either case the caller should look at *len [???]. - * It guarantees that the returned codepoint, *uv, when expressed as - * utf8 bytes, would fit within the skipped "\o{...}" bytes. - * On input: - * s is the address of a pointer to a NULL terminated string that begins - * with 'o', and the previous character was a backslash. At exit, *s - * will be advanced to the byte just after those absorbed by this - * function. Hence the caller can continue parsing from there. In - * the case of an error, this routine has generally positioned *s to - * point just to the right of the first bad spot, so that a message - * that has a "<--" to mark the spot will be correctly positioned. + * This returns FALSE if there is an error the caller should probably die + * from; otherwise TRUE. + * s is the address of a pointer to a string. **s is 'o', and the + * previous character was a backslash. At exit, *s will be advanced + * to the byte just after those absorbed by this function. Hence the + * caller can continue parsing from there. In the case of an error + * when this function returns FALSE, continuing to parse is not an + * option, this routine has generally positioned *s to point just to + * the right of the first bad spot, so that a message that has a "<--" + * to mark the spot will be correctly positioned. + * send - 1 gives a limit in *s that this function is not permitted to + * look beyond. That is, the function may look at bytes only in the + * range *s..send-1 * uv points to a UV that will hold the output value, valid only if the - * return from the function is TRUE - * error_msg is a pointer that will be set to an internal buffer giving an - * error message upon failure (the return is FALSE). Untouched if - * function succeeds - * output_warning says whether to output any warning messages, or suppress - * them + * return from the function is TRUE; may be changed from the input + * value even when FALSE is returned. + * message A pointer to any warning or error message will be stored into + * this pointer; NULL if none. + * packed_warn if NULL on input asks that this routine display any warning + * messages. Otherwise, if the function found a warning, the packed + * warning categories will be stored into *packed_warn (and the + * corresponding message text into *message); 0 if none. * strict is true if this should fail instead of warn if there are * non-octal digits within the braces - * silence_non_portable is true if to suppress warnings about the code - * point returned being too large to fit on all platforms. + * allow_UV_MAX is true if this shouldn't fail if the input code point is + * UV_MAX, which is normally illegal, reserved for internal use. * UTF is true iff the string *s is encoded in UTF-8. */ - char* e; + char * e; + char * rbrace; STRLEN numbers_len; + STRLEN trailing_blanks_len = 0; I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - /* XXX Until the message is improved in grok_oct, handle errors - * ourselves */ - | PERL_SCAN_SILENT_ILLDIGIT; + | PERL_SCAN_DISALLOW_PREFIX + | PERL_SCAN_SILENT_NON_PORTABLE + | PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_SILENT_OVERFLOW; PERL_ARGS_ASSERT_GROK_BSLASH_O; assert(*(*s - 1) == '\\'); assert(* *s == 'o'); + + *message = NULL; + if (packed_warn) *packed_warn = 0; + (*s)++; - if (**s != '{') { - *error_msg = "Missing braces on \\o{}"; - return FALSE; + if (send <= *s || **s != '{') { + *message = "Missing braces on \\o{}"; + return FALSE; } - e = strchr(*s, '}'); - if (!e) { + rbrace = (char *) memchr(*s, '}', send - *s); + if (!rbrace) { (*s)++; /* Move past the '{' */ - while (isOCTAL(**s)) { /* Position beyond the legal digits */ + + /* Position beyond the legal digits and blanks */ + while (*s < send && isBLANK(**s)) { (*s)++; } - *error_msg = "Missing right brace on \\o{"; - return FALSE; + + while (*s < send && isOCTAL(**s)) { + (*s)++; + } + + *message = "Missing right brace on \\o{}"; + return FALSE; + } + + /* Point to expected first digit (could be first byte of utf8 sequence if + * not a digit) */ + (*s)++; + while (isBLANK(**s)) { + (*s)++; + } + + e = rbrace; + while (*s < e && isBLANK(*(e - 1))) { + e--; } - (*s)++; /* Point to expected first digit (could be first byte of utf8 - sequence if not a digit) */ numbers_len = e - *s; if (numbers_len == 0) { - (*s)++; /* Move past the } */ - *error_msg = "Number with no digits"; - return FALSE; + (*s)++; /* Move past the '}' */ + *message = "Empty \\o{}"; + return FALSE; } - if (silence_non_portable) { - flags |= PERL_SCAN_SILENT_NON_PORTABLE; + *uv = grok_oct(*s, &numbers_len, &flags, NULL); + if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX) + || (! allow_UV_MAX && *uv == UV_MAX))) + { + *message = form_cp_too_large_msg(8, *s, numbers_len, 0); + *s = rbrace + 1; + return FALSE; + } + + while (isBLANK(**s)) { + trailing_blanks_len++; + (*s)++; } - *uv = grok_oct(*s, &numbers_len, &flags, NULL); /* Note that if has non-octal, will ignore everything starting with that up * to the '}' */ - - if (numbers_len != (STRLEN) (e - *s)) { + if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) { + *s += numbers_len; if (strict) { - *s += numbers_len; - *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; - *error_msg = "Non-octal character"; + *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; + *message = "Non-octal character"; return FALSE; } - else if (output_warning) { - Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), - /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ - "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", - *(*s + numbers_len), - (int) numbers_len, - *s); + + if (ckWARN(WARN_DIGIT)) { + const char * failure = form_alien_digit_msg(8, numbers_len, *s, send, + UTF, TRUE); + if (packed_warn) { + *message = failure; + *packed_warn = packWARN(WARN_DIGIT); + } + else { + Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); + } } } /* Return past the '}' */ - *s = e + 1; + *s = rbrace + 1; return TRUE; } bool -Perl_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg, - const bool output_warning, const bool strict, - const bool silence_non_portable, +Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv, + const char** message, + U32 * packed_warn, + const bool strict, + const bool allow_UV_MAX, const bool UTF) { /* Documentation to be supplied when interface nailed down finally - * This returns FALSE if there is an error which the caller need not recover + * This returns FALSE if there is an error the caller should probably die * from; otherwise TRUE. * It guarantees that the returned codepoint, *uv, when expressed as * utf8 bytes, would fit within the skipped "\x{...}" bytes. * * On input: - * s is the address of a pointer to a NULL terminated string that begins - * with 'x', and the previous character was a backslash. At exit, *s - * will be advanced to the byte just after those absorbed by this - * function. Hence the caller can continue parsing from there. In - * the case of an error, this routine has generally positioned *s to - * point just to the right of the first bad spot, so that a message - * that has a "<--" to mark the spot will be correctly positioned. + * s is the address of a pointer to a string. **s is 'x', and the + * previous character was a backslash. At exit, *s will be advanced + * to the byte just after those absorbed by this function. Hence the + * caller can continue parsing from there. In the case of an error, + * this routine has generally positioned *s to point just to the right + * of the first bad spot, so that a message that has a "<--" to mark + * the spot will be correctly positioned. + * send - 1 gives a limit in *s that this function is not permitted to + * look beyond. That is, the function may look at bytes only in the + * range *s..send-1 * uv points to a UV that will hold the output value, valid only if the - * return from the function is TRUE - * error_msg is a pointer that will be set to an internal buffer giving an - * error message upon failure (the return is FALSE). Untouched if - * function succeeds - * output_warning says whether to output any warning messages, or suppress - * them + * return from the function is TRUE; may be changed from the input + * value even when FALSE is returned. + * message A pointer to any warning or error message will be stored into + * this pointer; NULL if none. + * packed_warn if NULL on input asks that this routine display any warning + * messages. Otherwise, if the function found a warning, the packed + * warning categories will be stored into *packed_warn (and the + * corresponding message text into *message); 0 if none. * strict is true if anything out of the ordinary should cause this to * fail instead of warn or be silent. For example, it requires * exactly 2 digits following the \x (when there are no braces). * 3 digits could be a mistake, so is forbidden in this mode. - * silence_non_portable is true if to suppress warnings about the code - * point returned being too large to fit on all platforms. + * allow_UV_MAX is true if this shouldn't fail if the input code point is + * UV_MAX, which is normally illegal, reserved for internal use. * UTF is true iff the string *s is encoded in UTF-8. */ char* e; + char * rbrace; STRLEN numbers_len; - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - + STRLEN trailing_blanks_len = 0; + I32 flags = PERL_SCAN_DISALLOW_PREFIX + | PERL_SCAN_SILENT_ILLDIGIT + | PERL_SCAN_NOTIFY_ILLDIGIT + | PERL_SCAN_SILENT_NON_PORTABLE + | PERL_SCAN_SILENT_OVERFLOW; PERL_ARGS_ASSERT_GROK_BSLASH_X; assert(*(*s - 1) == '\\'); assert(* *s == 'x'); + + *message = NULL; + if (packed_warn) *packed_warn = 0; + (*s)++; - if (strict || ! output_warning) { - flags |= PERL_SCAN_SILENT_ILLDIGIT; + if (send <= *s) { + if (strict) { + *message = "Empty \\x"; + return FALSE; + } + + /* Sadly, to preserve backcompat, an empty \x at the end of string is + * interpreted as a NUL */ + *uv = 0; + return TRUE; } if (**s != '{') { - STRLEN len = (strict) ? 3 : 2; - - *uv = grok_hex(*s, &len, &flags, NULL); - *s += len; - if (strict && len != 2) { - if (len < 2) { - *s += (UTF) ? UTF8SKIP(*s) : 1; - *error_msg = "Non-hex character"; + numbers_len = (strict) ? 3 : 2; + + *uv = grok_hex(*s, &numbers_len, &flags, NULL); + *s += numbers_len; + + if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) { + if (numbers_len == 3) { /* numbers_len 3 only happens with strict */ + *message = "Use \\x{...} for more than two hex characters"; + return FALSE; } - else { - *error_msg = "Use \\x{...} for more than two hex characters"; + else if (strict) { + *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; + *message = "Non-hex character"; + return FALSE; + } + else if (ckWARN(WARN_DIGIT)) { + const char * failure = form_alien_digit_msg(16, numbers_len, *s, + send, UTF, FALSE); + + if (! packed_warn) { + Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); + } + else { + *message = failure; + *packed_warn = packWARN(WARN_DIGIT); + } } - return FALSE; } - return TRUE; + return TRUE; } - e = strchr(*s, '}'); - if (!e) { + rbrace = (char *) memchr(*s, '}', send - *s); + if (!rbrace) { (*s)++; /* Move past the '{' */ - while (isXDIGIT(**s)) { /* Position beyond the legal digits */ + + /* Position beyond legal blanks and digits */ + while (*s < send && isBLANK(**s)) { (*s)++; } - /* XXX The corresponding message above for \o is just '\\o{'; other - * messages for other constructs include the '}', so are inconsistent. - */ - *error_msg = "Missing right brace on \\x{}"; - return FALSE; + + while (*s < send && isXDIGIT(**s)) { + (*s)++; + } + + *message = "Missing right brace on \\x{}"; + return FALSE; } (*s)++; /* Point to expected first digit (could be first byte of utf8 sequence if not a digit) */ + while (isBLANK(**s)) { + (*s)++; + } + + e = rbrace; + while (*s < e && isBLANK(*(e - 1))) { + e--; + } + numbers_len = e - *s; if (numbers_len == 0) { if (strict) { (*s)++; /* Move past the } */ - *error_msg = "Number with no digits"; + *message = "Empty \\x{}"; return FALSE; } - *s = e + 1; + *s = rbrace + 1; *uv = 0; return TRUE; } flags |= PERL_SCAN_ALLOW_UNDERSCORES; - if (silence_non_portable) { - flags |= PERL_SCAN_SILENT_NON_PORTABLE; - } *uv = grok_hex(*s, &numbers_len, &flags, NULL); - /* Note that if has non-hex, will ignore everything starting with that up - * to the '}' */ + if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX) + || (! allow_UV_MAX && *uv == UV_MAX))) + { + *message = form_cp_too_large_msg(16, *s, numbers_len, 0); + *s = e + 1; + return FALSE; + } - if (strict && numbers_len != (STRLEN) (e - *s)) { + while (isBLANK(**s)) { + trailing_blanks_len++; + (*s)++; + } + + if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) { *s += numbers_len; - *s += (UTF) ? UTF8SKIP(*s) : 1; - *error_msg = "Non-hex character"; - return FALSE; + if (strict) { + *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; + *message = "Non-hex character"; + return FALSE; + } + + if (ckWARN(WARN_DIGIT)) { + const char * failure = form_alien_digit_msg(16, numbers_len, *s, + send, UTF, TRUE); + if (! packed_warn) { + Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); + } + else { + *message = failure; + *packed_warn = packWARN(WARN_DIGIT); + } + } } /* Return past the '}' */ - *s = e + 1; + *s = rbrace + 1; return TRUE; }