3 * This file contains functions that are related to
4 * parsing double-quotish expressions.
9 #define PERL_IN_DQUOTE_C
12 /* XXX Add documentation after final interface and behavior is decided */
15 Perl_grok_bslash_c(pTHX_ const char source,
20 PERL_ARGS_ASSERT_GROK_BSLASH_C;
22 /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it
23 * is valid, the sequence evaluates to a single character, which will be
24 * stored into *result.
26 * source is the character immediately after a '\c' sequence.
27 * result points to a char variable into which this function will store
28 * what the sequence evaluates to, if valid; unchanged otherwise.
29 * message A pointer to any warning or error message will be stored into
30 * this pointer; NULL if none.
31 * packed_warn if NULL on input asks that this routine display any warning
32 * messages. Otherwise, if the function found a warning, the
33 * packed warning categories will be stored into *packed_warn (and
34 * the corresponding message text into *message); 0 if none.
38 if (packed_warn) *packed_warn = 0;
40 if (! isPRINT_A(source)) {
41 *message = "Character following \"\\c\" must be printable ASCII";
46 const char control = toCTRL('{');
47 if (isPRINT_A(control)) {
48 /* diag_listed_as: Use "%s" instead of "%s" */
49 *message = Perl_form(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
52 *message = "Sequence \"\\c{\" invalid";
57 *result = toCTRL(source);
58 if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
61 char format[] = "\"\\c%c\" is more clearly written simply as \"%s\"";
63 if (! isWORDCHAR(*result)) {
66 clearer[i++] = *result;
70 *message = Perl_form(aTHX_ format, source, clearer);
71 *packed_warn = packWARN(WARN_SYNTAX);
74 Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
82 Perl_form_alien_digit_msg(pTHX_
83 const U8 which, /* 8 or 16 */
84 const STRLEN valids_len, /* length of input before first bad char */
85 const char * const first_bad, /* Ptr to that bad char */
86 const char * const send, /* End of input string */
87 const bool UTF, /* Is it in UTF-8? */
88 const bool braced) /* Is it enclosed in {} */
90 /* Generate a mortal SV containing an appropriate warning message about
91 * alien characters found in an octal or hex constant given by the inputs,
92 * and return a pointer to that SV's string. The message looks like:
94 * Non-hex character '?' terminates \x early. Resolved as "\x{...}"
98 /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
99 * (leading zeros could take up more space, and the scalar will
100 * automatically grow if necessary). Space for NUL is added by the newSV()
102 SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
103 SV * message_sv = sv_newmortal();
106 PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
107 assert(which == 8 || which == 16);
109 /* Calculate the display form of the character */
110 if ( UVCHR_IS_INVARIANT(*first_bad)
111 || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
113 pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
114 (STRLEN) -1, UNI_DISPLAY_QQ);
116 else { /* Is not UTF-8, or is illegal UTF-8. Show just the one byte */
118 /* It also isn't a UTF-8 invariant character, so no display shortcuts
119 * are available. Use \\x{...} */
120 Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
123 /* Ready to start building the message */
124 sv_setpvs(message_sv, "Non-");
126 sv_catpvs(message_sv, "octal");
131 symbol = '0'; /* \008, for example */
135 sv_catpvs(message_sv, "hex");
138 sv_catpvs(message_sv, " character ");
140 if (isPRINT(*first_bad)) {
141 sv_catpvs(message_sv, "'");
143 sv_catsv(message_sv, display_char);
144 if (isPRINT(*first_bad)) {
145 sv_catpvs(message_sv, "'");
147 Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early. Resolved as "
148 "\"\\%c", symbol, symbol);
150 sv_catpvs(message_sv, "{");
153 /* Octal constants have an extra leading 0, but \0 already includes that */
154 if (symbol == 'o' && valids_len < 3) {
155 sv_catpvs(message_sv, "0");
157 if (valids_len == 0) { /* No legal digits at all */
158 sv_catpvs(message_sv, "00");
160 else if (valids_len == 1) { /* Just one is legal */
161 sv_catpvs(message_sv, "0");
163 sv_catpvn(message_sv, first_bad - valids_len, valids_len);
166 sv_catpvs(message_sv, "}");
169 sv_catsv(message_sv, display_char);
171 sv_catpvs(message_sv, "\"");
173 SvREFCNT_dec_NN(display_char);
175 return SvPVX_const(message_sv);
179 Perl_form_cp_too_large_msg(pTHX_
180 const U8 which, /* 8 or 16 */
181 const char * string, /* NULL, or the text that is supposed to
182 represent a code point */
183 const Size_t len, /* length of 'string' if not NULL; else 0 */
184 const UV cp) /* 0 if 'string' not NULL; else the too-large
187 /* Generate a mortal SV containing an appropriate warning message about
188 * code points that are too large for this system, given by the inputs,
189 * and return a pointer to that SV's string. Either the text of the string
190 * to be converted to a code point is input, or a code point itself. The
191 * former is needed to accurately represent something that overflows.
193 * The message looks like:
195 * Use of code point %s is not allowed; the permissible max is %s
199 SV * message_sv = sv_newmortal();
203 PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
204 assert(which == 8 || which == 16);
206 /* One but not both must be non-zero */
207 assert((string != NULL) ^ (cp != 0));
208 assert((string == NULL) || len);
219 Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
221 Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
224 Perl_sv_catpvf(aTHX_ message_sv, format, cp);
226 Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
227 Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
229 return SvPVX_const(message_sv);
233 Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
234 const char** message,
237 const bool allow_UV_MAX,
241 /* Documentation to be supplied when interface nailed down finally
242 * This returns FALSE if there is an error the caller should probably die
243 * from; otherwise TRUE.
244 * s is the address of a pointer to a string. **s is 'o', and the
245 * previous character was a backslash. At exit, *s will be advanced
246 * to the byte just after those absorbed by this function. Hence the
247 * caller can continue parsing from there. In the case of an error
248 * when this function returns FALSE, continuing to parse is not an
249 * option, this routine has generally positioned *s to point just to
250 * the right of the first bad spot, so that a message that has a "<--"
251 * to mark the spot will be correctly positioned.
252 * send - 1 gives a limit in *s that this function is not permitted to
253 * look beyond. That is, the function may look at bytes only in the
255 * uv points to a UV that will hold the output value, valid only if the
256 * return from the function is TRUE; may be changed from the input
257 * value even when FALSE is returned.
258 * message A pointer to any warning or error message will be stored into
259 * this pointer; NULL if none.
260 * packed_warn if NULL on input asks that this routine display any warning
261 * messages. Otherwise, if the function found a warning, the packed
262 * warning categories will be stored into *packed_warn (and the
263 * corresponding message text into *message); 0 if none.
264 * strict is true if this should fail instead of warn if there are
265 * non-octal digits within the braces
266 * allow_UV_MAX is true if this shouldn't fail if the input code point is
267 * UV_MAX, which is normally illegal, reserved for internal use.
268 * UTF is true iff the string *s is encoded in UTF-8.
272 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
273 | PERL_SCAN_DISALLOW_PREFIX
274 | PERL_SCAN_SILENT_NON_PORTABLE
275 | PERL_SCAN_SILENT_ILLDIGIT
276 | PERL_SCAN_SILENT_OVERFLOW;
278 PERL_ARGS_ASSERT_GROK_BSLASH_O;
280 assert(*(*s - 1) == '\\');
284 if (packed_warn) *packed_warn = 0;
288 if (send <= *s || **s != '{') {
289 *message = "Missing braces on \\o{}";
293 e = (char *) memchr(*s, '}', send - *s);
295 (*s)++; /* Move past the '{' */
296 while (isOCTAL(**s)) { /* Position beyond the legal digits */
299 *message = "Missing right brace on \\o{}";
303 (*s)++; /* Point to expected first digit (could be first byte of utf8
304 sequence if not a digit) */
305 numbers_len = e - *s;
306 if (numbers_len == 0) {
307 (*s)++; /* Move past the '}' */
308 *message = "Empty \\o{}";
312 *uv = grok_oct(*s, &numbers_len, &flags, NULL);
313 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
314 || (! allow_UV_MAX && *uv == UV_MAX)))
316 *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
321 /* Note that if has non-octal, will ignore everything starting with that up
323 if (numbers_len != (STRLEN) (e - *s)) {
326 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
327 *message = "Non-octal character";
331 if (ckWARN(WARN_DIGIT)) {
332 const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
336 *packed_warn = packWARN(WARN_DIGIT);
339 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
344 /* Return past the '}' */
351 Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
352 const char** message,
355 const bool allow_UV_MAX,
359 /* Documentation to be supplied when interface nailed down finally
360 * This returns FALSE if there is an error the caller should probably die
361 * from; otherwise TRUE.
362 * It guarantees that the returned codepoint, *uv, when expressed as
363 * utf8 bytes, would fit within the skipped "\x{...}" bytes.
366 * s is the address of a pointer to a string. **s is 'x', and the
367 * previous character was a backslash. At exit, *s will be advanced
368 * to the byte just after those absorbed by this function. Hence the
369 * caller can continue parsing from there. In the case of an error,
370 * this routine has generally positioned *s to point just to the right
371 * of the first bad spot, so that a message that has a "<--" to mark
372 * the spot will be correctly positioned.
373 * send - 1 gives a limit in *s that this function is not permitted to
374 * look beyond. That is, the function may look at bytes only in the
376 * uv points to a UV that will hold the output value, valid only if the
377 * return from the function is TRUE; may be changed from the input
378 * value even when FALSE is returned.
379 * message A pointer to any warning or error message will be stored into
380 * this pointer; NULL if none.
381 * packed_warn if NULL on input asks that this routine display any warning
382 * messages. Otherwise, if the function found a warning, the packed
383 * warning categories will be stored into *packed_warn (and the
384 * corresponding message text into *message); 0 if none.
385 * strict is true if anything out of the ordinary should cause this to
386 * fail instead of warn or be silent. For example, it requires
387 * exactly 2 digits following the \x (when there are no braces).
388 * 3 digits could be a mistake, so is forbidden in this mode.
389 * allow_UV_MAX is true if this shouldn't fail if the input code point is
390 * UV_MAX, which is normally illegal, reserved for internal use.
391 * UTF is true iff the string *s is encoded in UTF-8.
395 I32 flags = PERL_SCAN_DISALLOW_PREFIX
396 | PERL_SCAN_SILENT_ILLDIGIT
397 | PERL_SCAN_NOTIFY_ILLDIGIT
398 | PERL_SCAN_SILENT_NON_PORTABLE
399 | PERL_SCAN_SILENT_OVERFLOW;
401 PERL_ARGS_ASSERT_GROK_BSLASH_X;
403 assert(*(*s - 1) == '\\');
407 if (packed_warn) *packed_warn = 0;
413 *message = "Empty \\x";
417 /* Sadly, to preserve backcompat, an empty \x at the end of string is
418 * interpreted as a NUL */
424 numbers_len = (strict) ? 3 : 2;
426 *uv = grok_hex(*s, &numbers_len, &flags, NULL);
429 if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
430 if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
431 *message = "Use \\x{...} for more than two hex characters";
435 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
436 *message = "Non-hex character";
439 else if (ckWARN(WARN_DIGIT)) {
440 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
444 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
448 *packed_warn = packWARN(WARN_DIGIT);
455 e = (char *) memchr(*s, '}', send - *s);
457 (*s)++; /* Move past the '{' */
458 while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */
461 *message = "Missing right brace on \\x{}";
465 (*s)++; /* Point to expected first digit (could be first byte of utf8
466 sequence if not a digit) */
467 numbers_len = e - *s;
468 if (numbers_len == 0) {
470 (*s)++; /* Move past the } */
471 *message = "Empty \\x{}";
479 flags |= PERL_SCAN_ALLOW_UNDERSCORES;
481 *uv = grok_hex(*s, &numbers_len, &flags, NULL);
482 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
483 || (! allow_UV_MAX && *uv == UV_MAX)))
485 *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
490 if (numbers_len != (STRLEN) (e - *s)) {
493 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
494 *message = "Non-hex character";
498 if (ckWARN(WARN_DIGIT)) {
499 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
502 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
506 *packed_warn = packWARN(WARN_DIGIT);
511 /* Return past the '}' */
518 * ex: set ts=8 sts=4 sw=4 et: