3 * This file contains functions that are related to
4 * parsing double-quotish expressions.
9 #define PERL_IN_DQUOTE_C
11 #include "dquote_inline.h"
13 /* XXX Add documentation after final interface and behavior is decided */
16 Perl_grok_bslash_c(pTHX_ const char source,
21 PERL_ARGS_ASSERT_GROK_BSLASH_C;
23 /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it
24 * is valid, the sequence evaluates to a single character, which will be
25 * stored into *result.
27 * source is the character immediately after a '\c' sequence.
28 * result points to a char variable into which this function will store
29 * what the sequence evaluates to, if valid; unchanged otherwise.
30 * message A pointer to any warning or error message will be stored into
31 * this pointer; NULL if none.
32 * packed_warn if NULL on input asks that this routine display any warning
33 * messages. Otherwise, if the function found a warning, the
34 * packed warning categories will be stored into *packed_warn (and
35 * the corresponding message text into *message); 0 if none.
39 if (packed_warn) *packed_warn = 0;
41 if (! isPRINT_A(source)) {
42 *message = "Character following \"\\c\" must be printable ASCII";
47 const char control = toCTRL('{');
48 if (isPRINT_A(control)) {
49 /* diag_listed_as: Use "%s" instead of "%s" */
50 *message = Perl_form(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
53 *message = "Sequence \"\\c{\" invalid";
58 *result = toCTRL(source);
59 if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
62 char format[] = "\"\\c%c\" is more clearly written simply as \"%s\"";
64 if (! isWORDCHAR(*result)) {
67 clearer[i++] = *result;
71 *message = Perl_form(aTHX_ format, source, clearer);
72 *packed_warn = packWARN(WARN_SYNTAX);
75 Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
83 Perl_form_alien_digit_msg(pTHX_
84 const U8 which, /* 8 or 16 */
85 const STRLEN valids_len, /* length of input before first bad char */
86 const char * const first_bad, /* Ptr to that bad char */
87 const char * const send, /* End of input string */
88 const bool UTF, /* Is it in UTF-8? */
89 const bool braced) /* Is it enclosed in {} */
91 /* Generate a mortal SV containing an appropriate warning message about
92 * alien characters found in an octal or hex constant given by the inputs,
93 * and return a pointer to that SV's string. The message looks like:
95 * Non-hex character '?' terminates \x early. Resolved as "\x{...}"
99 /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
100 * (leading zeros could take up more space, and the scalar will
101 * automatically grow if necessary). Space for NUL is added by the newSV()
103 SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
104 SV * message_sv = sv_newmortal();
107 PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
108 assert(which == 8 || which == 16);
110 /* Calculate the display form of the character */
111 if ( UVCHR_IS_INVARIANT(*first_bad)
112 || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
114 pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
115 (STRLEN) -1, UNI_DISPLAY_QQ);
117 else { /* Is not UTF-8, or is illegal UTF-8. Show just the one byte */
119 /* It also isn't a UTF-8 invariant character, so no display shortcuts
120 * are available. Use \\x{...} */
121 Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
124 /* Ready to start building the message */
125 sv_setpvs(message_sv, "Non-");
127 sv_catpvs(message_sv, "octal");
132 symbol = '0'; /* \008, for example */
136 sv_catpvs(message_sv, "hex");
139 sv_catpvs(message_sv, " character ");
141 if (isPRINT(*first_bad)) {
142 sv_catpvs(message_sv, "'");
144 sv_catsv(message_sv, display_char);
145 if (isPRINT(*first_bad)) {
146 sv_catpvs(message_sv, "'");
148 Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early. Resolved as "
149 "\"\\%c", symbol, symbol);
151 sv_catpvs(message_sv, "{");
154 /* Octal constants have an extra leading 0, but \0 already includes that */
155 if (symbol == 'o' && valids_len < 3) {
156 sv_catpvs(message_sv, "0");
158 if (valids_len == 0) { /* No legal digits at all */
159 sv_catpvs(message_sv, "00");
161 else if (valids_len == 1) { /* Just one is legal */
162 sv_catpvs(message_sv, "0");
164 sv_catpvn(message_sv, first_bad - valids_len, valids_len);
167 sv_catpvs(message_sv, "}");
170 sv_catsv(message_sv, display_char);
172 sv_catpvs(message_sv, "\"");
174 SvREFCNT_dec_NN(display_char);
176 return SvPVX_const(message_sv);
180 Perl_form_cp_too_large_msg(pTHX_
181 const U8 which, /* 8 or 16 */
182 const char * string, /* NULL, or the text that is supposed to
183 represent a code point */
184 const Size_t len, /* length of 'string' if not NULL; else 0 */
185 const UV cp) /* 0 if 'string' not NULL; else the too-large
188 /* Generate a mortal SV containing an appropriate warning message about
189 * code points that are too large for this system, given by the inputs,
190 * and return a pointer to that SV's string. Either the text of the string
191 * to be converted to a code point is input, or a code point itself. The
192 * former is needed to accurately represent something that overflows.
194 * The message looks like:
196 * Use of code point %s is not allowed; the permissible max is %s
200 SV * message_sv = sv_newmortal();
204 PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
205 assert(which == 8 || which == 16);
207 /* One but not both must be non-zero */
208 assert((string != NULL) ^ (cp != 0));
209 assert((string == NULL) || len);
220 Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
222 Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
225 Perl_sv_catpvf(aTHX_ message_sv, format, cp);
227 Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
228 Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
230 return SvPVX_const(message_sv);
234 Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
235 const char** message,
238 const bool allow_UV_MAX,
242 /* Documentation to be supplied when interface nailed down finally
243 * This returns FALSE if there is an error the caller should probably die
244 * from; otherwise TRUE.
245 * s is the address of a pointer to a string. **s is 'o', and the
246 * previous character was a backslash. At exit, *s will be advanced
247 * to the byte just after those absorbed by this function. Hence the
248 * caller can continue parsing from there. In the case of an error
249 * when this function returns FALSE, continuing to parse is not an
250 * option, this routine has generally positioned *s to point just to
251 * the right of the first bad spot, so that a message that has a "<--"
252 * to mark the spot will be correctly positioned.
253 * send - 1 gives a limit in *s that this function is not permitted to
254 * look beyond. That is, the function may look at bytes only in the
256 * uv points to a UV that will hold the output value, valid only if the
257 * return from the function is TRUE; may be changed from the input
258 * value even when FALSE is returned.
259 * message A pointer to any warning or error message will be stored into
260 * this pointer; NULL if none.
261 * packed_warn if NULL on input asks that this routine display any warning
262 * messages. Otherwise, if the function found a warning, the packed
263 * warning categories will be stored into *packed_warn (and the
264 * corresponding message text into *message); 0 if none.
265 * strict is true if this should fail instead of warn if there are
266 * non-octal digits within the braces
267 * allow_UV_MAX is true if this shouldn't fail if the input code point is
268 * UV_MAX, which is normally illegal, reserved for internal use.
269 * UTF is true iff the string *s is encoded in UTF-8.
273 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
274 | PERL_SCAN_DISALLOW_PREFIX
275 | PERL_SCAN_SILENT_NON_PORTABLE
276 | PERL_SCAN_SILENT_ILLDIGIT
277 | PERL_SCAN_SILENT_OVERFLOW;
279 PERL_ARGS_ASSERT_GROK_BSLASH_O;
281 assert(*(*s - 1) == '\\');
285 if (packed_warn) *packed_warn = 0;
289 if (send <= *s || **s != '{') {
290 *message = "Missing braces on \\o{}";
294 e = (char *) memchr(*s, '}', send - *s);
296 (*s)++; /* Move past the '{' */
297 while (isOCTAL(**s)) { /* Position beyond the legal digits */
300 *message = "Missing right brace on \\o{}";
304 (*s)++; /* Point to expected first digit (could be first byte of utf8
305 sequence if not a digit) */
306 numbers_len = e - *s;
307 if (numbers_len == 0) {
308 (*s)++; /* Move past the '}' */
309 *message = "Empty \\o{}";
313 *uv = grok_oct(*s, &numbers_len, &flags, NULL);
314 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
315 || (! allow_UV_MAX && *uv == UV_MAX)))
317 *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
322 /* Note that if has non-octal, will ignore everything starting with that up
324 if (numbers_len != (STRLEN) (e - *s)) {
327 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
328 *message = "Non-octal character";
332 if (ckWARN(WARN_DIGIT)) {
333 const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
337 *packed_warn = packWARN(WARN_DIGIT);
340 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
345 /* Return past the '}' */
352 Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
353 const char** message,
356 const bool allow_UV_MAX,
360 /* Documentation to be supplied when interface nailed down finally
361 * This returns FALSE if there is an error the caller should probably die
362 * from; otherwise TRUE.
363 * It guarantees that the returned codepoint, *uv, when expressed as
364 * utf8 bytes, would fit within the skipped "\x{...}" bytes.
367 * s is the address of a pointer to a string. **s is 'x', and the
368 * previous character was a backslash. At exit, *s will be advanced
369 * to the byte just after those absorbed by this function. Hence the
370 * caller can continue parsing from there. In the case of an error,
371 * this routine has generally positioned *s to point just to the right
372 * of the first bad spot, so that a message that has a "<--" to mark
373 * the spot will be correctly positioned.
374 * send - 1 gives a limit in *s that this function is not permitted to
375 * look beyond. That is, the function may look at bytes only in the
377 * uv points to a UV that will hold the output value, valid only if the
378 * return from the function is TRUE; may be changed from the input
379 * value even when FALSE is returned.
380 * message A pointer to any warning or error message will be stored into
381 * this pointer; NULL if none.
382 * packed_warn if NULL on input asks that this routine display any warning
383 * messages. Otherwise, if the function found a warning, the packed
384 * warning categories will be stored into *packed_warn (and the
385 * corresponding message text into *message); 0 if none.
386 * strict is true if anything out of the ordinary should cause this to
387 * fail instead of warn or be silent. For example, it requires
388 * exactly 2 digits following the \x (when there are no braces).
389 * 3 digits could be a mistake, so is forbidden in this mode.
390 * allow_UV_MAX is true if this shouldn't fail if the input code point is
391 * UV_MAX, which is normally illegal, reserved for internal use.
392 * UTF is true iff the string *s is encoded in UTF-8.
396 I32 flags = PERL_SCAN_DISALLOW_PREFIX
397 | PERL_SCAN_SILENT_ILLDIGIT
398 | PERL_SCAN_NOTIFY_ILLDIGIT
399 | PERL_SCAN_SILENT_NON_PORTABLE
400 | PERL_SCAN_SILENT_OVERFLOW;
402 PERL_ARGS_ASSERT_GROK_BSLASH_X;
404 assert(*(*s - 1) == '\\');
408 if (packed_warn) *packed_warn = 0;
414 *message = "Empty \\x";
418 /* Sadly, to preserve backcompat, an empty \x at the end of string is
419 * interpreted as a NUL */
425 numbers_len = (strict) ? 3 : 2;
427 *uv = grok_hex(*s, &numbers_len, &flags, NULL);
430 if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
431 if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
432 *message = "Use \\x{...} for more than two hex characters";
436 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
437 *message = "Non-hex character";
440 else if (ckWARN(WARN_DIGIT)) {
441 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
445 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
449 *packed_warn = packWARN(WARN_DIGIT);
456 e = (char *) memchr(*s, '}', send - *s);
458 (*s)++; /* Move past the '{' */
459 while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */
462 *message = "Missing right brace on \\x{}";
466 (*s)++; /* Point to expected first digit (could be first byte of utf8
467 sequence if not a digit) */
468 numbers_len = e - *s;
469 if (numbers_len == 0) {
471 (*s)++; /* Move past the } */
472 *message = "Empty \\x{}";
480 flags |= PERL_SCAN_ALLOW_UNDERSCORES;
482 *uv = grok_hex(*s, &numbers_len, &flags, NULL);
483 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
484 || (! allow_UV_MAX && *uv == UV_MAX)))
486 *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
491 if (numbers_len != (STRLEN) (e - *s)) {
494 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
495 *message = "Non-hex character";
499 if (ckWARN(WARN_DIGIT)) {
500 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
503 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
507 *packed_warn = packWARN(WARN_DIGIT);
512 /* Return past the '}' */
519 * ex: set ts=8 sts=4 sw=4 et: