This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Promote v5.36 usage and feature bundles doc
[perl5.git] / dquote.c
CommitLineData
a55c5245 1/* dquote.c
04e98a4d 2 *
a55c5245
JH
3 * This file contains functions that are related to
4 * parsing double-quotish expressions.
04e98a4d 5 *
04e98a4d
AD
6*/
7
a55c5245
JH
8#include "EXTERN.h"
9#define PERL_IN_DQUOTE_C
10#include "perl.h"
881ffab6 11
68b355dd 12/* XXX Add documentation after final interface and behavior is decided */
68b355dd 13
73351a71
KW
14bool
15Perl_grok_bslash_c(pTHX_ const char source,
16 U8 * result,
17 const char** message,
18 U32 * packed_warn)
68b355dd 19{
73351a71 20 PERL_ARGS_ASSERT_GROK_BSLASH_C;
68b355dd 21
73351a71
KW
22 /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it
23 * is valid, the sequence evaluates to a single character, which will be
24 * stored into *result.
25 *
26 * source is the character immediately after a '\c' sequence.
27 * result points to a char variable into which this function will store
28 * what the sequence evaluates to, if valid; unchanged otherwise.
29 * message A pointer to any warning or error message will be stored into
30 * this pointer; NULL if none.
31 * packed_warn if NULL on input asks that this routine display any warning
32 * messages. Otherwise, if the function found a warning, the
33 * packed warning categories will be stored into *packed_warn (and
34 * the corresponding message text into *message); 0 if none.
35 */
36
37 *message = NULL;
38 if (packed_warn) *packed_warn = 0;
68b355dd 39
421e43ba 40 if (! isPRINT_A(source)) {
73351a71
KW
41 *message = "Character following \"\\c\" must be printable ASCII";
42 return FALSE;
68b355dd 43 }
73351a71
KW
44
45 if (source == '{') {
a27ed980
KW
46 const char control = toCTRL('{');
47 if (isPRINT_A(control)) {
48 /* diag_listed_as: Use "%s" instead of "%s" */
73351a71 49 *message = Perl_form(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
a27ed980
KW
50 }
51 else {
73351a71 52 *message = "Sequence \"\\c{\" invalid";
a27ed980 53 }
73351a71 54 return FALSE;
421e43ba 55 }
68b355dd 56
73351a71
KW
57 *result = toCTRL(source);
58 if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
4d8be631
KW
59 U8 clearer[3];
60 U8 i = 0;
73351a71
KW
61 char format[] = "\"\\c%c\" is more clearly written simply as \"%s\"";
62
63 if (! isWORDCHAR(*result)) {
4d8be631
KW
64 clearer[i++] = '\\';
65 }
73351a71 66 clearer[i++] = *result;
4d8be631 67 clearer[i++] = '\0';
68b355dd 68
73351a71
KW
69 if (packed_warn) {
70 *message = Perl_form(aTHX_ format, source, clearer);
71 *packed_warn = packWARN(WARN_SYNTAX);
72 }
73 else {
74 Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
75 }
68b355dd
KW
76 }
77
73351a71 78 return TRUE;
68b355dd
KW
79}
80
8d1e72f0
KW
81const char *
82Perl_form_alien_digit_msg(pTHX_
83 const U8 which, /* 8 or 16 */
84 const STRLEN valids_len, /* length of input before first bad char */
85 const char * const first_bad, /* Ptr to that bad char */
86 const char * const send, /* End of input string */
87 const bool UTF, /* Is it in UTF-8? */
88 const bool braced) /* Is it enclosed in {} */
89{
90 /* Generate a mortal SV containing an appropriate warning message about
91 * alien characters found in an octal or hex constant given by the inputs,
92 * and return a pointer to that SV's string. The message looks like:
93 *
94 * Non-hex character '?' terminates \x early. Resolved as "\x{...}"
95 *
96 */
97
98 /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
99 * (leading zeros could take up more space, and the scalar will
100 * automatically grow if necessary). Space for NUL is added by the newSV()
101 * function */
102 SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
103 SV * message_sv = sv_newmortal();
104 char symbol;
105
106 PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
107 assert(which == 8 || which == 16);
108
109 /* Calculate the display form of the character */
110 if ( UVCHR_IS_INVARIANT(*first_bad)
111 || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
112 {
113 pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
114 (STRLEN) -1, UNI_DISPLAY_QQ);
115 }
116 else { /* Is not UTF-8, or is illegal UTF-8. Show just the one byte */
117
118 /* It also isn't a UTF-8 invariant character, so no display shortcuts
119 * are available. Use \\x{...} */
1f4fbd3b 120 Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
8d1e72f0
KW
121 }
122
123 /* Ready to start building the message */
124 sv_setpvs(message_sv, "Non-");
125 if (which == 8) {
126 sv_catpvs(message_sv, "octal");
127 if (braced) {
128 symbol = 'o';
129 }
130 else {
131 symbol = '0'; /* \008, for example */
132 }
133 }
134 else {
135 sv_catpvs(message_sv, "hex");
136 symbol = 'x';
137 }
138 sv_catpvs(message_sv, " character ");
139
140 if (isPRINT(*first_bad)) {
141 sv_catpvs(message_sv, "'");
142 }
143 sv_catsv(message_sv, display_char);
144 if (isPRINT(*first_bad)) {
145 sv_catpvs(message_sv, "'");
146 }
147 Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early. Resolved as "
148 "\"\\%c", symbol, symbol);
149 if (braced) {
150 sv_catpvs(message_sv, "{");
151 }
152
153 /* Octal constants have an extra leading 0, but \0 already includes that */
154 if (symbol == 'o' && valids_len < 3) {
155 sv_catpvs(message_sv, "0");
156 }
157 if (valids_len == 0) { /* No legal digits at all */
158 sv_catpvs(message_sv, "00");
159 }
160 else if (valids_len == 1) { /* Just one is legal */
161 sv_catpvs(message_sv, "0");
162 }
163 sv_catpvn(message_sv, first_bad - valids_len, valids_len);
164
165 if (braced) {
166 sv_catpvs(message_sv, "}");
167 }
168 else {
169 sv_catsv(message_sv, display_char);
170 }
171 sv_catpvs(message_sv, "\"");
172
173 SvREFCNT_dec_NN(display_char);
174
175 return SvPVX_const(message_sv);
176}
177
178const char *
179Perl_form_cp_too_large_msg(pTHX_
180 const U8 which, /* 8 or 16 */
181 const char * string, /* NULL, or the text that is supposed to
182 represent a code point */
183 const Size_t len, /* length of 'string' if not NULL; else 0 */
184 const UV cp) /* 0 if 'string' not NULL; else the too-large
185 code point */
186{
187 /* Generate a mortal SV containing an appropriate warning message about
188 * code points that are too large for this system, given by the inputs,
189 * and return a pointer to that SV's string. Either the text of the string
190 * to be converted to a code point is input, or a code point itself. The
191 * former is needed to accurately represent something that overflows.
192 *
193 * The message looks like:
194 *
195 * Use of code point %s is not allowed; the permissible max is %s
196 *
197 */
198
199 SV * message_sv = sv_newmortal();
200 const char * format;
201 const char * prefix;
202
203 PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
204 assert(which == 8 || which == 16);
205
206 /* One but not both must be non-zero */
207 assert((string != NULL) ^ (cp != 0));
208 assert((string == NULL) || len);
209
210 if (which == 8) {
211 format = "%" UVof;
212 prefix = "0";
213 }
214 else {
215 format = "%" UVXf;
216 prefix = "0x";
217 }
218
219 Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
220 if (string) {
221 Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
222 }
223 else {
224 Perl_sv_catpvf(aTHX_ message_sv, format, cp);
225 }
226 Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
227 Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
228
229 return SvPVX_const(message_sv);
230}
231
a55c5245 232bool
e8278639 233Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
5763c818 234 const char** message,
8d1e72f0
KW
235 U32 * packed_warn,
236 const bool strict,
237 const bool allow_UV_MAX,
80f4111b 238 const bool UTF)
db30362b
KW
239{
240
241/* Documentation to be supplied when interface nailed down finally
8d1e72f0
KW
242 * This returns FALSE if there is an error the caller should probably die
243 * from; otherwise TRUE.
e8278639
KW
244 * s is the address of a pointer to a string. **s is 'o', and the
245 * previous character was a backslash. At exit, *s will be advanced
246 * to the byte just after those absorbed by this function. Hence the
8d1e72f0
KW
247 * caller can continue parsing from there. In the case of an error
248 * when this function returns FALSE, continuing to parse is not an
249 * option, this routine has generally positioned *s to point just to
250 * the right of the first bad spot, so that a message that has a "<--"
251 * to mark the spot will be correctly positioned.
e8278639
KW
252 * send - 1 gives a limit in *s that this function is not permitted to
253 * look beyond. That is, the function may look at bytes only in the
254 * range *s..send-1
db30362b 255 * uv points to a UV that will hold the output value, valid only if the
8d1e72f0
KW
256 * return from the function is TRUE; may be changed from the input
257 * value even when FALSE is returned.
258 * message A pointer to any warning or error message will be stored into
259 * this pointer; NULL if none.
260 * packed_warn if NULL on input asks that this routine display any warning
261 * messages. Otherwise, if the function found a warning, the packed
262 * warning categories will be stored into *packed_warn (and the
263 * corresponding message text into *message); 0 if none.
80f4111b
KW
264 * strict is true if this should fail instead of warn if there are
265 * non-octal digits within the braces
8d1e72f0
KW
266 * allow_UV_MAX is true if this shouldn't fail if the input code point is
267 * UV_MAX, which is normally illegal, reserved for internal use.
80f4111b 268 * UTF is true iff the string *s is encoded in UTF-8.
db30362b 269 */
1b2f32d5 270 char * e;
fa2251a9 271 char * rbrace;
db30362b 272 STRLEN numbers_len;
1b2f32d5 273 STRLEN trailing_blanks_len = 0;
db30362b 274 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8d1e72f0
KW
275 | PERL_SCAN_DISALLOW_PREFIX
276 | PERL_SCAN_SILENT_NON_PORTABLE
277 | PERL_SCAN_SILENT_ILLDIGIT
278 | PERL_SCAN_SILENT_OVERFLOW;
db30362b
KW
279
280 PERL_ARGS_ASSERT_GROK_BSLASH_O;
281
7f4ec488
KW
282 assert(*(*s - 1) == '\\');
283 assert(* *s == 'o');
8d1e72f0
KW
284
285 *message = NULL;
286 if (packed_warn) *packed_warn = 0;
287
00ce5563 288 (*s)++;
db30362b 289
83a1b28e 290 if (send <= *s || **s != '{') {
1f4fbd3b
MS
291 *message = "Missing braces on \\o{}";
292 return FALSE;
db30362b
KW
293 }
294
fa2251a9
KW
295 rbrace = (char *) memchr(*s, '}', send - *s);
296 if (!rbrace) {
00ce5563 297 (*s)++; /* Move past the '{' */
1b2f32d5
KW
298
299 /* Position beyond the legal digits and blanks */
300 while (*s < send && isBLANK(**s)) {
301 (*s)++;
302 }
303
304 while (*s < send && isOCTAL(**s)) {
b8de99ca
KW
305 (*s)++;
306 }
1b2f32d5 307
3b34e85b 308 *message = "Missing right brace on \\o{}";
1f4fbd3b 309 return FALSE;
db30362b
KW
310 }
311
1b2f32d5
KW
312 /* Point to expected first digit (could be first byte of utf8 sequence if
313 * not a digit) */
314 (*s)++;
315 while (isBLANK(**s)) {
316 (*s)++;
317 }
318
319 e = rbrace;
320 while (*s < e && isBLANK(*(e - 1))) {
321 e--;
322 }
323
324 numbers_len = e - *s;
db30362b 325 if (numbers_len == 0) {
8d1e72f0 326 (*s)++; /* Move past the '}' */
1f4fbd3b
MS
327 *message = "Empty \\o{}";
328 return FALSE;
db30362b
KW
329 }
330
00ce5563 331 *uv = grok_oct(*s, &numbers_len, &flags, NULL);
8d1e72f0
KW
332 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
333 || (! allow_UV_MAX && *uv == UV_MAX)))
334 {
335 *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
fa2251a9 336 *s = rbrace + 1;
8d1e72f0
KW
337 return FALSE;
338 }
339
1b2f32d5
KW
340 while (isBLANK(**s)) {
341 trailing_blanks_len++;
342 (*s)++;
343 }
344
db30362b
KW
345 /* Note that if has non-octal, will ignore everything starting with that up
346 * to the '}' */
1b2f32d5 347 if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) {
8d1e72f0 348 *s += numbers_len;
80f4111b 349 if (strict) {
6dd641e1 350 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
5763c818 351 *message = "Non-octal character";
80f4111b
KW
352 return FALSE;
353 }
8d1e72f0
KW
354
355 if (ckWARN(WARN_DIGIT)) {
356 const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
357 UTF, TRUE);
358 if (packed_warn) {
359 *message = failure;
360 *packed_warn = packWARN(WARN_DIGIT);
361 }
362 else {
363 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
364 }
80f4111b 365 }
db30362b
KW
366 }
367
00ce5563 368 /* Return past the '}' */
fa2251a9 369 *s = rbrace + 1;
00ce5563 370
db30362b
KW
371 return TRUE;
372}
373
ce54a8b9 374bool
8d1e72f0 375Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
5763c818 376 const char** message,
8d1e72f0
KW
377 U32 * packed_warn,
378 const bool strict,
379 const bool allow_UV_MAX,
ce54a8b9
KW
380 const bool UTF)
381{
5e0a247b 382
ce54a8b9 383/* Documentation to be supplied when interface nailed down finally
8d1e72f0 384 * This returns FALSE if there is an error the caller should probably die
ce54a8b9
KW
385 * from; otherwise TRUE.
386 * It guarantees that the returned codepoint, *uv, when expressed as
387 * utf8 bytes, would fit within the skipped "\x{...}" bytes.
388 *
389 * On input:
e8278639
KW
390 * s is the address of a pointer to a string. **s is 'x', and the
391 * previous character was a backslash. At exit, *s will be advanced
392 * to the byte just after those absorbed by this function. Hence the
393 * caller can continue parsing from there. In the case of an error,
394 * this routine has generally positioned *s to point just to the right
395 * of the first bad spot, so that a message that has a "<--" to mark
396 * the spot will be correctly positioned.
397 * send - 1 gives a limit in *s that this function is not permitted to
398 * look beyond. That is, the function may look at bytes only in the
399 * range *s..send-1
ce54a8b9 400 * uv points to a UV that will hold the output value, valid only if the
8d1e72f0
KW
401 * return from the function is TRUE; may be changed from the input
402 * value even when FALSE is returned.
403 * message A pointer to any warning or error message will be stored into
404 * this pointer; NULL if none.
405 * packed_warn if NULL on input asks that this routine display any warning
406 * messages. Otherwise, if the function found a warning, the packed
407 * warning categories will be stored into *packed_warn (and the
408 * corresponding message text into *message); 0 if none.
ce54a8b9
KW
409 * strict is true if anything out of the ordinary should cause this to
410 * fail instead of warn or be silent. For example, it requires
411 * exactly 2 digits following the \x (when there are no braces).
412 * 3 digits could be a mistake, so is forbidden in this mode.
8d1e72f0
KW
413 * allow_UV_MAX is true if this shouldn't fail if the input code point is
414 * UV_MAX, which is normally illegal, reserved for internal use.
ce54a8b9
KW
415 * UTF is true iff the string *s is encoded in UTF-8.
416 */
1b2f32d5 417 char* e;
fa2251a9 418 char * rbrace;
ce54a8b9 419 STRLEN numbers_len;
1b2f32d5 420 STRLEN trailing_blanks_len = 0;
eb761011 421 I32 flags = PERL_SCAN_DISALLOW_PREFIX
8d1e72f0
KW
422 | PERL_SCAN_SILENT_ILLDIGIT
423 | PERL_SCAN_NOTIFY_ILLDIGIT
424 | PERL_SCAN_SILENT_NON_PORTABLE
425 | PERL_SCAN_SILENT_OVERFLOW;
5e0a247b 426
ce54a8b9 427 PERL_ARGS_ASSERT_GROK_BSLASH_X;
5e0a247b 428
7f4ec488
KW
429 assert(*(*s - 1) == '\\');
430 assert(* *s == 'x');
0b97d156 431
8d1e72f0
KW
432 *message = NULL;
433 if (packed_warn) *packed_warn = 0;
434
ce54a8b9 435 (*s)++;
5e0a247b 436
0b97d156
KW
437 if (send <= *s) {
438 if (strict) {
5763c818 439 *message = "Empty \\x";
0b97d156
KW
440 return FALSE;
441 }
442
443 /* Sadly, to preserve backcompat, an empty \x at the end of string is
444 * interpreted as a NUL */
445 *uv = 0;
446 return TRUE;
447 }
448
ce54a8b9 449 if (**s != '{') {
8d1e72f0
KW
450 numbers_len = (strict) ? 3 : 2;
451
1f4fbd3b
MS
452 *uv = grok_hex(*s, &numbers_len, &flags, NULL);
453 *s += numbers_len;
8d1e72f0
KW
454
455 if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
456 if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
5763c818 457 *message = "Use \\x{...} for more than two hex characters";
8d1e72f0
KW
458 return FALSE;
459 }
460 else if (strict) {
461 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
462 *message = "Non-hex character";
463 return FALSE;
464 }
465 else if (ckWARN(WARN_DIGIT)) {
466 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
467 send, UTF, FALSE);
468
469 if (! packed_warn) {
470 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
471 }
472 else {
473 *message = failure;
474 *packed_warn = packWARN(WARN_DIGIT);
475 }
ce54a8b9 476 }
ce54a8b9 477 }
1f4fbd3b 478 return TRUE;
ce54a8b9
KW
479 }
480
fa2251a9
KW
481 rbrace = (char *) memchr(*s, '}', send - *s);
482 if (!rbrace) {
ce54a8b9 483 (*s)++; /* Move past the '{' */
1b2f32d5
KW
484
485 /* Position beyond legal blanks and digits */
486 while (*s < send && isBLANK(**s)) {
ce54a8b9
KW
487 (*s)++;
488 }
1b2f32d5
KW
489
490 while (*s < send && isXDIGIT(**s)) {
491 (*s)++;
492 }
493
1f4fbd3b
MS
494 *message = "Missing right brace on \\x{}";
495 return FALSE;
ce54a8b9
KW
496 }
497
498 (*s)++; /* Point to expected first digit (could be first byte of utf8
499 sequence if not a digit) */
1b2f32d5
KW
500 while (isBLANK(**s)) {
501 (*s)++;
502 }
503
504 e = rbrace;
505 while (*s < e && isBLANK(*(e - 1))) {
506 e--;
507 }
508
509 numbers_len = e - *s;
ce54a8b9
KW
510 if (numbers_len == 0) {
511 if (strict) {
512 (*s)++; /* Move past the } */
5763c818 513 *message = "Empty \\x{}";
ce54a8b9
KW
514 return FALSE;
515 }
fa2251a9 516 *s = rbrace + 1;
ce54a8b9
KW
517 *uv = 0;
518 return TRUE;
519 }
520
521 flags |= PERL_SCAN_ALLOW_UNDERSCORES;
ce54a8b9
KW
522
523 *uv = grok_hex(*s, &numbers_len, &flags, NULL);
8d1e72f0
KW
524 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
525 || (! allow_UV_MAX && *uv == UV_MAX)))
526 {
527 *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
1b2f32d5 528 *s = e + 1;
8d1e72f0
KW
529 return FALSE;
530 }
ce54a8b9 531
1b2f32d5
KW
532 while (isBLANK(**s)) {
533 trailing_blanks_len++;
534 (*s)++;
535 }
536
537 if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) {
ce54a8b9 538 *s += numbers_len;
8d1e72f0
KW
539 if (strict) {
540 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
541 *message = "Non-hex character";
542 return FALSE;
543 }
544
545 if (ckWARN(WARN_DIGIT)) {
546 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
547 send, UTF, TRUE);
548 if (! packed_warn) {
549 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
550 }
551 else {
552 *message = failure;
553 *packed_warn = packWARN(WARN_DIGIT);
554 }
555 }
ce54a8b9
KW
556 }
557
558 /* Return past the '}' */
fa2251a9 559 *s = rbrace + 1;
ce54a8b9
KW
560
561 return TRUE;
5e0a247b
KW
562}
563
04e98a4d 564/*
14d04a33 565 * ex: set ts=8 sts=4 sw=4 et:
04e98a4d 566 */